##################################3
###   Edcuation protject       ###
###   Jonne Kamphorst          ###
###   2022                     ###
###   Liss analysis            ###
###   Version: August 15       ###   
##################################3



###Libraries and data###
library(dplyr)
library(plm)
library(lmtest)
library(estimatr)
library(ggeffects)
library(extrafont)
library(stargazer)
library(ggplot2)
library(tidyverse)
library(texreg)
library(modelsummary)
library(plm)
library(ggpubr)
library(viridis)
library(did2s)
library(PanelMatch)
library(lme4)
library(gridExtra)
library(fect)
library(panelView)
library(haven)
library(ggeffects)

#font_import()
loadfonts(quiet = T, device = "pdf")
windowsFonts(Georgia = windowsFont("Georgia")) #load fonts for windows machines


## Set ggplot theme with the Georgia font
theme_set(theme_light(base_family = "Georgia", base_size = 12))




load("LISS/full_panel.Rda")

full_panel <- full_panel %>% distinct(nomem_encr, wave, .keep_all=T) %>%
  filter(year < 15) # remove the newest wave so we use the same data as in the submission (does not change results)

full_panel <- haven::zap_labels(full_panel)





################ Recode outcome variables ---------------------------------------------------------
# immi cv08a104 In the Netherlands, some people believe that immigrants are entitled to live here while retaining their own culture. Others feel that they should adapt entirely to Dutch culture. Where would you place yourself on a scale of 1 to 5, where 1 means that immigrants can retain their own culture and 5 means that they should adapt entirely?
# eu_uni cv08a105 Some people and political parties feel that European unification should go a step further. Others think that European unification has already gone too far. Where would you place yourself on a scale from 1 to 5, where 1 means that European unification should go further and 0 means that it has already gone too far?
# elec_rec cv08a054 Voted in last election


# Add voting behavior variables based on prospective voting (if election were today)
full_panel$vote_green <- ifelse(full_panel$elec_today == "GL",1 , 0)
full_panel$vote_green[is.na(full_panel$elec_today)] <- NA

full_panel$vote_prog <- ifelse(full_panel$elec_today == "GL" | full_panel$elec_today == "D66",1 , 0)
full_panel$vote_prog[is.na(full_panel$elec_today)] <- NA


full_panel$vote_RRP <- ifelse(full_panel$elec_today == "PVV" | 
                                 full_panel$elec_today == "FvD"|
                                 full_panel$elec_today == "LPF",1 , 0)
full_panel$vote_RRP[is.na(full_panel$elec_today)] <- NA


#continuous immigration variable and EU variable where higher values indicate progressive positions
full_panel$immi_progressive_high <-  max(full_panel$immi, na.rm=T) - full_panel$immi
full_panel$eu_progressive_high <-  max(full_panel$eu_uni, na.rm=T) - full_panel$eu_uni
full_panel$immi_for_many_pro <-  max(full_panel$for_many, na.rm=T) - full_panel$for_many


# Dummies for EU and immigration
full_panel$immi_pro <- NA
full_panel$immi_pro[full_panel$immi < 3] <- 1
full_panel$immi_pro[full_panel$immi > 2] <- 0

full_panel$eu_pro <- NA
full_panel$eu_pro[full_panel$eu_uni < 3] <- 1
full_panel$eu_pro[full_panel$eu_uni > 2] <- 0


# Merge D66 and GL thermostat scale
full_panel$term_d66gl <- (full_panel$term_d66 + full_panel$term_gl) / 2













# ################ Create immigration index using ICW ---------------------------------------
## Reverse the polarity for the questions where that is needed so that progressive is always the higher value
# immi_for_many_pro,  diver_cultures   asylum_eas 

full_panel$asylum_eas_rev <- max(full_panel$asylum_eas, na.rm=T) - full_panel$asylum_eas
full_panel$diver_cultures_rev <- max(full_panel$diver_cultures, na.rm=T) - full_panel$diver_cultures

# Add together using ICW
# Function to standardize columns of a matrix
# where you designate a standardization group
# (e.g., the control group in an experiment)
# with "sgroup", a logical vector.

matStand <- function(x, sgroup = rep(TRUE, nrow(x))){
  for(j in 1:ncol(x)){
    x[,j] <- (x[,j] - mean(x[sgroup,j]))/sd(x[sgroup,j])
  }
  return(x)
}

# Function that takes in data in matrix format and returns
# (i) IC weights and (ii) ICW index.
# Weights can be incorporated using the "wgts" argument.
# The "revcols" argument takes a vector indicating which columns,
# if any, should have values reversed (that is, standardized 
# values are multiplied by -1) prior to construction of the index. 

icwIndex <- function(	xmat,
                      wgts=rep(1, nrow(xmat)),
                      revcols = NULL,
                      sgroup = rep(TRUE, nrow(xmat))){
  X <- matStand(xmat, sgroup)
  if(length(revcols)>0){
    X[,revcols] <-  -1*X[,revcols]
  }
  i.vec <- as.matrix(rep(1,ncol(xmat)))
  Sx <- cov.wt(X, wt=wgts)[[1]]
  weights <- solve(t(i.vec)%*%solve(Sx)%*%i.vec)%*%t(i.vec)%*%solve(Sx)
  index <- t(solve(t(i.vec)%*%solve(Sx)%*%i.vec)%*%t(i.vec)%*%solve(Sx)%*%t(X))
  return(list(weights = weights, index = index))
}

####  Immigration index 
# Add the ICW in a special DF
holdem <- full_panel %>% dplyr::select(immi_for_many_pro, asylum_eas_rev, diver_cultures_rev, nomem_encr, wave) %>%
  drop_na(immi_for_many_pro, asylum_eas_rev, diver_cultures_rev)
matrix <- holdem %>% dplyr::select(immi_for_many_pro, asylum_eas_rev, diver_cultures_rev)
matrix <- as.matrix(matrix)
matrix_icw <- icwIndex(matrix)
holdem$immigration_icw_outcome <- as.numeric(matrix_icw$index)

holdem <- holdem %>% dplyr::select(nomem_encr, wave, immigration_icw_outcome)


full_panel <- full_join(full_panel, holdem)

# Standardize to run between 0 and 1
full_panel$immigration_icw_outcome <- (full_panel$immigration_icw_outcome - min(full_panel$immigration_icw_outcome, na.rm=T)) /
  (max(full_panel$immigration_icw_outcome,na.rm=T) - min(full_panel$immigration_icw_outcome,na.rm=T))

min(full_panel$immigration_icw_outcome,na.rm=T)
max(full_panel$immigration_icw_outcome,na.rm=T)
mean(full_panel$immigration_icw_outcome,na.rm=T)
sd(full_panel$immigration_icw_outcome,na.rm=T)












################ Add variables on respondent ---------------------------------------------------------
###### Education variables
# Respondent education dummy. This is for completing education!
full_panel$education <- as.numeric(full_panel$education)
full_panel$higher_edu <- NA
full_panel$higher_edu[full_panel$education >=5] <- 1 #"University #HBO is 5
full_panel$higher_edu[full_panel$education < 5] <- 0#"Non-univeristy"

# Create a dummy for the education someone is attending (note that this is attending OR completing)
full_panel$higher_edu_atten <- NA
full_panel$higher_edu_atten[full_panel$education_fromworkschool_atten >= 19 & full_panel$education_fromworkschool_atten != 27] <- 1 #"University #HBO is 5
full_panel$higher_edu_atten[full_panel$education_fromworkschool_atten < 19] <- 0#"Non-univeristy"



# Create a dummy for being in education
full_panel$ineducatio <- ifelse(full_panel$occupation==7 | full_panel$occupation==14, 1, 0)

## Get the later education level of someone. We use the normal education variable here as it's common in research to use this one.
#     However, we're also interested in what people study so we also use the attending variable
# Add the latest wave
full_panel <- full_panel %>% group_by(nomem_encr) %>%
  mutate(max_wave = max(wave)) %>%
  ungroup()

edu <- full_panel %>% group_by(nomem_encr) %>% filter(wave==max_wave) %>%
  ungroup() %>%
  dplyr::select(nomem_encr, education) %>% rename(max_education = education)
full_panel <- left_join(full_panel, edu)
rm(edu)

full_panel$max_degree_uni <- ifelse(full_panel$max_education > 4, 1, 0)



# Add a variable that captures if we have people in education
full_panel <- full_panel %>% group_by(nomem_encr) %>% 
  mutate(mean_ineducatio = mean(ineducatio,na.rm=T)) %>% 
  ungroup()
full_panel$have_inedu <- ifelse(full_panel$mean_ineducatio > 0,1,0)



##### Prepare the control variables for the OLS
#Gender
full_panel$women <- ifelse(full_panel$gender==2, 1, 0)

#Rural urban and ethnicity
full_panel$urban

full_panel <- full_panel %>% group_by(nomem_encr) %>%
  fill(urban, .direction = "downup") %>%
  fill(herkomstgroep, .direction = "downup") %>%
  ungroup()

full_panel$urban_dummy <- ifelse(full_panel$urban < 3, 1, 0)
full_panel$urban <- abs(5 - full_panel$urban)


full_panel$migration_background <- ifelse(full_panel$herkomstgroep!=0,1,0)


### Income
full_panel$nettoink[full_panel$nettoink<0] <- NA

#Fill in the NA
full_panel <- full_panel %>% group_by(nomem_encr) %>%
  fill(nettoink, .direction = "downup") %>%
  ungroup()

full_panel$nettoink_stand <- full_panel$nettoink + 1
full_panel$nettoink_stand <- full_panel$nettoink / 1000

full_panel$nettoink_stand


stand <- function(x){
  var <- (x - mean(x,na.rm=T))/sd(x,na.rm=T)
  return(var)
}

full_panel$nettoink_stand <- stand(full_panel$nettoink)

#Rural urban and ethnicity
full_panel <- full_panel %>% group_by(nomem_encr) %>%
  fill(urban, .direction = "downup") %>%
  fill(herkomstgroep, .direction = "downup") %>%
  ungroup()

full_panel$urban_dummy <- ifelse(full_panel$urban < 3, 1, 0)
full_panel$urban <- abs(5 - full_panel$urban)


full_panel$migration_background <- ifelse(full_panel$herkomstgroep!=0,1,0)


# age by 10 years
full_panel$age10 <- full_panel$age / 10


# survey year
full_panel$yearf <- as.factor(full_panel$year)
table(full_panel$employed, full_panel$year)

# Add calendar year to be able to add birth year 
full_panel$calyear <- NA
full_panel$calyear[full_panel$year==15] <- 2022
full_panel$calyear[full_panel$year==14] <- 2021
full_panel$calyear[full_panel$year==13] <- 2020
full_panel$calyear[full_panel$year == 12] <- 2019
full_panel$calyear[full_panel$year == 11] <- 2018
full_panel$calyear[full_panel$year == 10] <- 2017
full_panel$calyear[full_panel$year == 9] <- 2016
full_panel$calyear[full_panel$year == 8] <- 2015
full_panel$calyear[full_panel$year == 7] <- 2014
full_panel$calyear[full_panel$year == 6] <- 2013
full_panel$calyear[full_panel$year == 5] <- 2012
full_panel$calyear[full_panel$year == 4] <- 2011
full_panel$calyear[full_panel$year == 3] <- 2010
full_panel$calyear[full_panel$year == 2] <- 2009
full_panel$calyear[full_panel$year == 1] <- 2008

# add birth year
full_panel$birth_year <- full_panel$calyear - full_panel$age
full_panel <- full_panel %>% group_by(nomem_encr) %>% fill(birth_year, .direction=c("downup")) %>% 
  mutate(birth_year = round(mean(birth_year), 0)) %>% 
  ungroup() %>%dplyr::select(-calyear)

# Add the cohort in which someone was born
# With more familiar generational terms
full_panel$generation <- NA
full_panel$generation[full_panel$birth_year < 1946] <- 1
full_panel$generation[full_panel$birth_year > 1945 & full_panel$birth_year < 1956] <- 2
full_panel$generation[full_panel$birth_year > 1955 & full_panel$birth_year < 1966] <- 3
full_panel$generation[full_panel$birth_year > 1965 & full_panel$birth_year < 1976] <- 4
full_panel$generation[full_panel$birth_year > 1975 & full_panel$birth_year < 1986] <- 5
full_panel$generation[full_panel$birth_year > 1985 & full_panel$birth_year < 1996] <- 6
full_panel$generation[full_panel$birth_year > 1995] <- 7 # blocked out for further down




# Fix the being employed variable (starting in wave 20 the variable for being employed is gone from the data)
full_panel$employed[is.na(full_panel$employed) & full_panel$occupation < 4] <- 1 
full_panel$employed[is.na(full_panel$employed) & full_panel$occupation >= 4] <- 0 



## Clean the profession variable
# If NA people don't qualify for the question
full_panel$profession_404_clean <- full_panel$profession_404
full_panel$profession_404_clean[is.na(full_panel$profession_404_clean)] <- 10
full_panel$profession_404_clean <- as.factor(full_panel$profession_404_clean)



################ Add education variables of ties ---------------------------------------------------------
full_panel$pers1_edu[full_panel$pers1_edu==8] <- NA # 8 is don't know education of person
full_panel$pers2_edu[full_panel$pers2_edu==8] <- NA
full_panel$pers3_edu[full_panel$pers3_edu==8] <- NA
full_panel$pers4_edu[full_panel$pers4_edu==8] <- NA
full_panel$pers5_edu[full_panel$pers5_edu==8] <- NA

full_panel$pers1_edu[full_panel$pers1_edu<=1] <- NA # smaller than 1 is in some cases don't know education. Equal to 1 is when someone is still in education. This needs to go out to properly simulate the ties (done for R&R)
full_panel$pers2_edu[full_panel$pers2_edu<=1] <- NA
full_panel$pers3_edu[full_panel$pers3_edu<=1] <- NA
full_panel$pers4_edu[full_panel$pers4_edu<=1] <- NA
full_panel$pers5_edu[full_panel$pers5_edu<=1] <- NA

full_panel$pers1_edu_wpare <- full_panel$pers1_edu 
full_panel$pers2_edu_wpare <- full_panel$pers2_edu
full_panel$pers3_edu_wpare <- full_panel$pers3_edu
full_panel$pers4_edu_wpare <- full_panel$pers4_edu
full_panel$pers5_edu_wpare <- full_panel$pers5_edu

full_panel$pers1_edu[full_panel$pers1_know==2] <- NA # 2 is parents and we take those out in all models
full_panel$pers2_edu[full_panel$pers2_know==2] <- NA
full_panel$pers3_edu[full_panel$pers3_know==2] <- NA
full_panel$pers4_edu[full_panel$pers4_know==2] <- NA
full_panel$pers5_edu[full_panel$pers5_know==2] <- NA


# Treatment dummies for any friend with lower education
full_panel$friend_loweredu <- NA
full_panel$friend_loweredu[(full_panel$pers1_edu < 6 & full_panel$pers1_know > 2)| #education of the person is lower than HBO and the person is not partner or parent
                             (full_panel$pers2_edu < 6 & full_panel$pers2_know > 2)|
                             (full_panel$pers3_edu < 6 & full_panel$pers3_know > 2) |
                             (full_panel$pers4_edu < 6 & full_panel$pers4_know > 2) |
                             (full_panel$pers5_edu < 6 & full_panel$pers5_know > 2)] <- 1 # if any friend is lower educated, then 1
table(full_panel$friend_loweredu)

# If one of the friend questions is not missing and you don't have higher friends, then 0. I.e. this is only 0 if people report at least one higher educated tie that is not their parent or partner
full_panel$friend_loweredu[(full_panel$pers1_edu > 5 & full_panel$pers1_know > 2 & is.na(full_panel$friend_loweredu))] <- 0 #education is hbo or higher, pers is not part or par, and the variable is missing
full_panel$friend_loweredu[(full_panel$pers2_edu > 5 & full_panel$pers2_know > 2 & is.na(full_panel$friend_loweredu))] <- 0
full_panel$friend_loweredu[(full_panel$pers3_edu > 5 & full_panel$pers3_know > 2 & is.na(full_panel$friend_loweredu))] <- 0
full_panel$friend_loweredu[(full_panel$pers4_edu > 5 & full_panel$pers4_know > 2 & is.na(full_panel$friend_loweredu))] <- 0
full_panel$friend_loweredu[(full_panel$pers5_edu > 5 & full_panel$pers5_know > 2 & is.na(full_panel$friend_loweredu))] <- 0


# Treatment dummies for any friend with higher education. No parents or partner
full_panel$friend_higheredu <- NA
full_panel$friend_higheredu[(full_panel$pers1_edu > 5 & full_panel$pers1_know > 2)| 
                              (full_panel$pers2_edu > 5 & full_panel$pers2_know > 2)|
                              (full_panel$pers3_edu > 5 & full_panel$pers3_know > 2) |
                              (full_panel$pers4_edu > 5 & full_panel$pers4_know > 2) |
                              (full_panel$pers5_edu > 5 & full_panel$pers5_know > 2)] <- 1 # if any friend is lower educated, then 1
# this becomes 0 only if people report at least one lower-educated tie that is not their parent or partner. 
full_panel$friend_higheredu[(full_panel$pers1_edu < 6 & full_panel$pers1_know > 2 & is.na(full_panel$friend_higheredu))] <- 0
full_panel$friend_higheredu[(full_panel$pers2_edu < 6 & full_panel$pers2_know > 2 & is.na(full_panel$friend_higheredu))] <- 0
full_panel$friend_higheredu[(full_panel$pers3_edu < 6 & full_panel$pers3_know > 2 & is.na(full_panel$friend_higheredu))] <- 0
full_panel$friend_higheredu[(full_panel$pers4_edu < 6 & full_panel$pers4_know > 2 & is.na(full_panel$friend_higheredu))] <- 0
full_panel$friend_higheredu[(full_panel$pers5_edu < 6 & full_panel$pers5_know > 2 & is.na(full_panel$friend_higheredu))] <- 0




## Add a variable for having a friend (anyone except parent or partner) with a different education level
full_panel$a_friend_differentedu <- NA
full_panel$a_friend_differentedu[full_panel$higher_edu == 1 & full_panel$friend_loweredu == 1] <- "Other edu friend"
full_panel$a_friend_differentedu[full_panel$higher_edu == 1 & full_panel$friend_loweredu == 0] <- "No other edu friend"
full_panel$a_friend_differentedu[full_panel$higher_edu == 0 & full_panel$friend_higheredu == 1] <- "Other edu friend"
full_panel$a_friend_differentedu[full_panel$higher_edu == 0 & full_panel$friend_higheredu == 0] <- "No other edu friend"



# Total number of waves someone is in the panel
full_panel <- full_panel %>% group_by(nomem_encr) %>% mutate(number_of_waves =n() ) %>% ungroup()




## Create a sorting variable like in the ESS analysis. This is a variable that captures sorting without parents or partners. 
full_panel$sorting <- paste0(full_panel$a_friend_differentedu, full_panel$higher_edu)
full_panel$sorting[is.na(full_panel$a_friend_differentedu)] <- NA
full_panel$sorting[is.na(full_panel$higher_edu)] <- NA

full_panel$sorting <- factor(full_panel$sorting,
                             levels=c("No other edu friend1",
                                      "Other edu friend1",
                                      "Other edu friend0",
                                      "No other edu friend0"))







## Make a variable that captures the degree of network heterogeneity 

# Friend education dummy - one for lower and one for higher, so that you can more easily make a number of ties dummy
# Higher educated - NOTE that NA is put at zero which we can do because we control for total number of ties and subset in the models
full_panel$pers1_HU <- NA
full_panel$pers1_HU[full_panel$pers1_edu >= 6] <- 1
full_panel$pers1_HU[full_panel$pers1_edu < 6] <- 0
full_panel$pers1_HU[is.na(full_panel$pers1_edu)] <- 0 #Nas to 0 because you just add up these variables later

full_panel$pers2_HU <- NA
full_panel$pers2_HU[full_panel$pers2_edu >= 6] <- 1
full_panel$pers2_HU[full_panel$pers2_edu < 6] <- 0
full_panel$pers2_HU[is.na(full_panel$pers2_edu)] <- 0

full_panel$pers3_HU <- NA
full_panel$pers3_HU[full_panel$pers3_edu >= 6] <- 1
full_panel$pers3_HU[full_panel$pers3_edu < 6] <- 0
full_panel$pers3_HU[is.na(full_panel$pers3_edu)] <- 0

full_panel$pers4_HU <- NA
full_panel$pers4_HU[full_panel$pers4_edu >= 6] <- 1
full_panel$pers4_HU[full_panel$pers4_edu < 6] <- 0
full_panel$pers4_HU[is.na(full_panel$pers4_edu)] <- 0

full_panel$pers5_HU <- NA
full_panel$pers5_HU[full_panel$pers5_edu >= 6] <- 1
full_panel$pers5_HU[full_panel$pers5_edu < 6] <- 0
full_panel$pers5_HU[is.na(full_panel$pers5_edu)] <- 0

# Lower educated

full_panel$pers1_LU <- NA
full_panel$pers1_LU[full_panel$pers1_edu >= 6] <- 0
full_panel$pers1_LU[full_panel$pers1_edu < 6] <- 1
full_panel$pers1_LU[is.na(full_panel$pers1_edu)] <- 0

full_panel$pers2_LU <- NA
full_panel$pers2_LU[full_panel$pers2_edu >= 6] <- 0
full_panel$pers2_LU[full_panel$pers2_edu < 6] <- 1
full_panel$pers2_LU[is.na(full_panel$pers2_edu)] <- 0

full_panel$pers3_LU <- NA
full_panel$pers3_LU[full_panel$pers3_edu >= 6] <- 0
full_panel$pers3_LU[full_panel$pers3_edu < 6] <- 1
full_panel$pers3_LU[is.na(full_panel$pers3_edu)] <- 0

full_panel$pers4_LU <- NA
full_panel$pers4_LU[full_panel$pers4_edu >= 6] <- 0
full_panel$pers4_LU[full_panel$pers4_edu < 6] <- 1
full_panel$pers4_LU[is.na(full_panel$pers4_edu)] <- 0

full_panel$pers5_LU <- NA
full_panel$pers5_LU[full_panel$pers5_edu >= 6] <- 0
full_panel$pers5_LU[full_panel$pers5_edu < 6] <- 1
full_panel$pers5_LU[is.na(full_panel$pers5_edu)] <- 0

# First simply number of ties: variable for amount of low ties in network (high and low)

full_panel <- full_panel %>% mutate(number_high_ties = pers1_HU + pers2_HU + pers3_HU + pers4_HU + pers5_HU)

full_panel <- full_panel %>% mutate(number_low_ties = pers1_LU + pers2_LU + pers3_LU + pers4_LU + pers5_LU)

full_panel$has_highties <- ifelse(full_panel$number_high_ties > 0,1,0)
full_panel$has_lowties <- ifelse(full_panel$number_low_ties > 0,1,0)

# Variable for number of low(high) ties 'that are very dear to you'

full_panel <- full_panel %>% mutate(number_high_ties_dear = (pers1_HU * pers1_dear) + (pers2_HU * pers2_dear)
                        + (pers3_HU * pers3_dear) + (pers4_HU * pers4_dear) + (pers5_HU * pers5_dear))

full_panel <- full_panel %>% mutate(number_low_ties_dear = (pers1_LU * pers1_dear) + (pers2_LU * pers2_dear)
                        + (pers3_LU * pers3_dear) + (pers4_LU * pers4_dear) + (pers5_LU * pers5_dear))


# Variable for total number of ties, to later control for in models
full_panel <- full_panel %>% mutate(total_ties = number_low_ties + number_high_ties)


## Add the proportion of high ties
full_panel$number_high_ties_proportion <- full_panel$number_high_ties / full_panel$total_ties



#### Repeat the same process but now include VWO en HAVO in high school
full_panel$pers1_HU_wvwo <- NA
full_panel$pers1_HU_wvwo[full_panel$pers1_edu >= 6 | full_panel$pers1_edu == 4] <- 1
full_panel$pers1_HU_wvwo[full_panel$pers1_edu < 6 & full_panel$pers1_edu != 4] <- 0
full_panel$pers1_HU_wvwo[is.na(full_panel$pers1_edu)] <- 0

full_panel$pers2_HU_wvwo <- NA
full_panel$pers2_HU_wvwo[full_panel$pers2_edu >= 6 | full_panel$pers2_edu == 4] <- 1
full_panel$pers2_HU_wvwo[full_panel$pers2_edu < 6 & full_panel$pers2_edu != 4] <- 0
full_panel$pers2_HU_wvwo[is.na(full_panel$pers2_edu)] <- 0

full_panel$pers3_HU_wvwo <- NA
full_panel$pers3_HU_wvwo[full_panel$pers3_edu >= 6 | full_panel$pers3_edu == 4] <- 1
full_panel$pers3_HU_wvwo[full_panel$pers3_edu < 6 & full_panel$pers3_edu != 4] <- 0
full_panel$pers3_HU_wvwo[is.na(full_panel$pers3_edu)] <- 0

full_panel$pers4_HU_wvwo <- NA
full_panel$pers4_HU_wvwo[full_panel$pers4_edu >= 6 | full_panel$pers4_edu == 4] <- 1
full_panel$pers4_HU_wvwo[full_panel$pers4_edu < 6 & full_panel$pers4_edu != 4] <- 0
full_panel$pers4_HU_wvwo[is.na(full_panel$pers4_edu)] <- 0

full_panel$pers5_HU_wvwo <- NA
full_panel$pers5_HU_wvwo[full_panel$pers5_edu >= 6 | full_panel$pers5_edu == 4] <- 1
full_panel$pers5_HU_wvwo[full_panel$pers5_edu < 6 & full_panel$pers5_edu != 4] <- 0
full_panel$pers5_HU_wvwo[is.na(full_panel$pers5_edu)] <- 0

# Lower educated

full_panel$pers1_LU_wvwo <- NA
full_panel$pers1_LU_wvwo[full_panel$pers1_edu >= 6 | full_panel$pers1_edu == 4] <- 0
full_panel$pers1_LU_wvwo[full_panel$pers1_edu < 6 & full_panel$pers1_edu != 4] <- 1
full_panel$pers1_LU_wvwo[is.na(full_panel$pers1_edu)] <- 0

full_panel$pers2_LU_wvwo <- NA
full_panel$pers2_LU_wvwo[full_panel$pers2_edu >= 6 | full_panel$pers2_edu == 4] <- 0
full_panel$pers2_LU_wvwo[full_panel$pers2_edu < 6 & full_panel$pers2_edu != 4] <- 1
full_panel$pers2_LU_wvwo[is.na(full_panel$pers2_edu)] <- 0

full_panel$pers3_LU_wvwo <- NA
full_panel$pers3_LU_wvwo[full_panel$pers3_edu >= 6 | full_panel$pers3_edu == 4] <- 0
full_panel$pers3_LU_wvwo[full_panel$pers3_edu < 6 & full_panel$pers3_edu != 4] <- 1
full_panel$pers3_LU_wvwo[is.na(full_panel$pers3_edu)] <- 0

full_panel$pers4_LU_wvwo <- NA
full_panel$pers4_LU_wvwo[full_panel$pers4_edu >= 6 | full_panel$pers4_edu == 4] <- 0
full_panel$pers4_LU_wvwo[full_panel$pers4_edu < 6 & full_panel$pers4_edu != 4] <- 1
full_panel$pers4_LU_wvwo[is.na(full_panel$pers4_edu)] <- 0

full_panel$pers5_LU_wvwo <- NA
full_panel$pers5_LU_wvwo[full_panel$pers5_edu >= 6 | full_panel$pers5_edu == 4] <- 0
full_panel$pers5_LU_wvwo[full_panel$pers5_edu < 6 & full_panel$pers5_edu != 4] <- 1
full_panel$pers5_LU_wvwo[is.na(full_panel$pers5_edu)] <- 0

# First simply number of ties: variable for amount of low ties in network (high and low)

full_panel <- full_panel %>% mutate(number_high_ties_wvwo = pers1_HU_wvwo + pers2_HU_wvwo + pers3_HU_wvwo + pers4_HU_wvwo + pers5_HU_wvwo)

full_panel <- full_panel %>% mutate(number_low_ties_wvwo = pers1_LU_wvwo + pers2_LU_wvwo + pers3_LU_wvwo + pers4_LU_wvwo + pers5_LU_wvwo)


# Variable for total number of ties, to later control for in models
full_panel <- full_panel %>% mutate(total_ties_wvwo = number_low_ties_wvwo + number_high_ties_wvwo)




# Number of ties only focusing on everyone minus parents and partners
full_panel$pers1_friends_edu  <- full_panel$pers1_edu
full_panel$pers2_friends_edu <- full_panel$pers2_edu
full_panel$pers3_friends_edu <- full_panel$pers3_edu
full_panel$pers4_friends_edu <- full_panel$pers4_edu
full_panel$pers5_friends_edu <- full_panel$pers5_edu

# If the person is '2' it's someone's parent and '1' is partner. (We had already taken out 2 so that;s just double now)
full_panel$pers1_friends_edu[full_panel$pers1_know == 1 | full_panel$pers1_know == 2] <- NA 
full_panel$pers2_friends_edu[full_panel$pers2_know == 1 | full_panel$pers2_know == 2] <- NA
full_panel$pers3_friends_edu[full_panel$pers3_know == 1 | full_panel$pers3_know == 2] <- NA
full_panel$pers4_friends_edu[full_panel$pers4_know == 1 | full_panel$pers4_know == 2] <- NA
full_panel$pers5_friends_edu[full_panel$pers5_know == 1 | full_panel$pers5_know == 2] <- NA

full_panel$pers1_friend_HU <- NA
full_panel$pers1_friend_HU[full_panel$pers1_friends_edu >= 6] <- 1
full_panel$pers1_friend_HU[full_panel$pers1_friends_edu < 6] <- 0
full_panel$pers1_friend_HU[is.na(full_panel$pers1_friends_edu)] <- 0

full_panel$pers2_friend_HU <- NA
full_panel$pers2_friend_HU[full_panel$pers2_friends_edu >= 6] <- 1
full_panel$pers2_friend_HU[full_panel$pers2_friends_edu < 6] <- 0
full_panel$pers2_friend_HU[is.na(full_panel$pers2_friends_edu)] <- 0

full_panel$pers3_friend_HU <- NA
full_panel$pers3_friend_HU[full_panel$pers3_friends_edu >= 6] <- 1
full_panel$pers3_friend_HU[full_panel$pers3_friends_edu < 6] <- 0
full_panel$pers3_friend_HU[is.na(full_panel$pers3_friends_edu)] <- 0

full_panel$pers4_friend_HU <- NA
full_panel$pers4_friend_HU[full_panel$pers4_friends_edu >= 6] <- 1
full_panel$pers4_friend_HU[full_panel$pers4_friends_edu < 6] <- 0
full_panel$pers4_friend_HU[is.na(full_panel$pers4_friends_edu)] <- 0

full_panel$pers5_friend_HU <- NA
full_panel$pers5_friend_HU[full_panel$pers5_friends_edu >= 6] <- 1
full_panel$pers5_friend_HU[full_panel$pers5_friends_edu < 6] <- 0
full_panel$pers5_friend_HU[is.na(full_panel$pers5_friends_edu)] <- 0

# Lower educated

full_panel$pers1_friend_LU <- NA
full_panel$pers1_friend_LU[full_panel$pers1_friends_edu >= 6] <- 0
full_panel$pers1_friend_LU[full_panel$pers1_friends_edu < 6] <- 1
full_panel$pers1_friend_LU[is.na(full_panel$pers1_friends_edu)] <- 0

full_panel$pers2_friend_LU <- NA
full_panel$pers2_friend_LU[full_panel$pers2_friends_edu >= 6] <- 0
full_panel$pers2_friend_LU[full_panel$pers2_friends_edu < 6] <- 1
full_panel$pers2_friend_LU[is.na(full_panel$pers2_friends_edu)] <- 0

full_panel$pers3_friend_LU <- NA
full_panel$pers3_friend_LU[full_panel$pers3_friends_edu >= 6] <- 0
full_panel$pers3_friend_LU[full_panel$pers3_friends_edu < 6] <- 1
full_panel$pers3_friend_LU[is.na(full_panel$pers3_friends_edu)] <- 0

full_panel$pers4_friend_LU <- NA
full_panel$pers4_friend_LU[full_panel$pers4_friends_edu >= 6] <- 0
full_panel$pers4_friend_LU[full_panel$pers4_friends_edu < 6] <- 1
full_panel$pers4_friend_LU[is.na(full_panel$pers4_friends_edu)] <- 0

full_panel$pers5_friend_LU <- NA
full_panel$pers5_friend_LU[full_panel$pers5_friends_edu >= 6] <- 0
full_panel$pers5_friend_LU[full_panel$pers5_friends_edu < 6] <- 1
full_panel$pers5_friend_LU[is.na(full_panel$pers5_friends_edu)] <- 0

# First simply number of ties: variable for amount of low ties in network (high and low)

full_panel <- full_panel %>% mutate(number_high_ties_friends = pers1_friend_HU + pers2_friend_HU + pers3_friend_HU + pers4_friend_HU + pers5_friend_HU)

full_panel <- full_panel %>% mutate(number_low_ties_friends = pers1_friend_LU + pers2_friend_LU + pers3_friend_LU + pers4_friend_LU + pers5_friend_LU)



# Variable for total number of ties, to later control for in models
full_panel <- full_panel %>% mutate(total_ties_friends = number_low_ties_friends + number_high_ties_friends)









## Add a variable that captures if one or both of someone's parents are higher educated
full_panel$parents_HU <- NA
full_panel$parents_HU[full_panel$pers1_edu_wpare >= 6 & full_panel$pers1_know == 2] <- 1 #if the person is higher educated and a parent then 1
full_panel$parents_HU[full_panel$pers2_edu_wpare >= 6 & full_panel$pers2_know == 2] <- 1
full_panel$parents_HU[full_panel$pers3_edu_wpare >= 6 & full_panel$pers3_know == 2] <- 1
full_panel$parents_HU[full_panel$pers4_edu_wpare >= 6 & full_panel$pers4_know == 2] <- 1
full_panel$parents_HU[full_panel$pers5_edu_wpare >= 6 & full_panel$pers5_know == 2] <- 1

full_panel$parents_HU <- ifelse(is.na(full_panel$parents_HU) & (full_panel$pers1_know == 2 |
                                                                  full_panel$pers2_know == 2 |
                                                                  full_panel$pers3_know == 2 |
                                                                  full_panel$pers4_know == 2 |
                                                                  full_panel$pers5_know == 2), 0, full_panel$parents_HU)

# Then for each respondent, code if they report this anywhere in the data as a constant variable
full_panel <- full_panel %>% group_by(nomem_encr) %>%
  mutate(working = mean(parents_HU,na.rm=T),
         parents_HU_anyw = ifelse(working>0, 1, 
                             ifelse(working<=0, 0, NA))) %>%
  dplyr::select(-working) %>%
  ungroup()






## Make index: one number for each observation
full_panel <- 
  full_panel %>% 
  group_by(nomem_encr) %>% 
  mutate(index = 1:n()) %>% 
  ungroup()

full_panel$index <- full_panel$index - 1 #set the first year as 0 for multilevel models

# Number of years someone is in the panel (if someone skips a year, then this will still be counted)
full_panel <- full_panel %>% 
  group_by(nomem_encr) %>% 
  mutate(min_year = min(year), # earliest year someone is in the panel
         number_of_years = year - min_year) %>% 
  ungroup()



# Number of years someone is in the panel (if someone skips a year, then this will still be counted)
full_panel <- full_panel %>% 
  group_by(nomem_encr) %>% 
  mutate(max_number_of_years = max(number_of_years)) %>% 
  ungroup()



# Add a sorting variables based on the number of ties and education level of a respondent
full_panel$number_low_ties_bins <- full_panel$number_low_ties
full_panel$number_low_ties_bins[full_panel$number_low_ties_bins > 2] <- "3 or more ties"

full_panel$number_high_ties_bins <- full_panel$number_high_ties
full_panel$number_high_ties_bins[full_panel$number_high_ties_bins > 2] <- "3 or more ties"

full_panel$number_other_ties_bins <- NA
full_panel$number_other_ties_bins <- full_panel$number_low_ties_bins #paste in number of low ties
full_panel$number_other_ties_bins[full_panel$higher_edu_atten==0] <- NA #for people who are lower educated, low ties are not other ties thus set to NA
full_panel$number_other_ties_bins[is.na(full_panel$number_other_ties_bins)] <- full_panel$number_high_ties_bins[is.na(full_panel$number_other_ties_bins)] #the NAs are now lower educated people. Thus merge in number og high ties for all these NAs

# Drop respondents where we don't have tie or education information
full_panel_save <- full_panel
full_panel <- full_panel %>% drop_na(number_low_ties, number_high_ties, higher_edu_atten)

# add the sorting variable 
full_panel$sorting_number_ties <- paste0(full_panel$number_other_ties_bins,"__", full_panel$higher_edu_atten)


# Add a sorting variables based on the number of ties and education level of a respondent. FOR FRIENDS ONLY
full_panel$number_low_ties_bins_friends <- full_panel$number_low_ties_friends
full_panel$number_low_ties_bins_friends[full_panel$number_low_ties_bins_friends > 2] <- "3 or more ties"

full_panel$number_high_ties_bins_friends <- full_panel$number_high_ties_friends
full_panel$number_high_ties_bins_friends[full_panel$number_high_ties_bins_friends > 2] <- "3 or more ties"

full_panel$number_other_ties_bins_friends <- NA
full_panel$number_other_ties_bins_friends <- full_panel$number_low_ties_bins_friends #paste in number of low ties
full_panel$number_other_ties_bins_friends[full_panel$higher_edu_atten==0] <- NA #for people who are lower educated, low ties are not other ties thus set to NA
full_panel$number_other_ties_bins_friends[is.na(full_panel$number_other_ties_bins_friends)] <- full_panel$number_high_ties_bins_friends[is.na(full_panel$number_other_ties_bins_friends)] #the NAs are now lower educated people. Thus merge in number og high ties for all these NAs


# add the sorting variable FOR FRIENDS ONLY
full_panel$sorting_number_ties_friends <- paste0(full_panel$number_other_ties_bins_friends,"__", full_panel$higher_edu_atten)
full_panel$sorting_number_ties_friends <- factor(full_panel$sorting_number_ties_friends, levels=c("0__1", "1__1", "2__1", "3 or more ties__1", "3 or more ties__0", "2__0", "1__0", "0__0"))


# continuous version
full_panel$number_other_ties_cont <- NA
full_panel$number_other_ties_cont <- full_panel$number_low_ties #paste in number of low ties
full_panel$number_other_ties_cont[full_panel$higher_edu_atten==0] <- NA #for people who are lower educated, low ties are not other ties thus set to NA
full_panel$number_other_ties_cont[is.na(full_panel$number_other_ties_cont)] <- full_panel$number_high_ties[is.na(full_panel$number_other_ties_cont)] #the NAs are now lower educated people. Thus merge in number og high ties for all these NAs


# continuous version for the variable with VWO included
full_panel$number_other_ties_cont_wvwo <- NA
full_panel$number_other_ties_cont_wvwo <- full_panel$number_low_ties_wvwo #paste in number of low ties
full_panel$number_other_ties_cont_wvwo[full_panel$higher_edu_atten==0] <- NA #for people who are lower educated, low ties are not other ties thus set to NA
full_panel$number_other_ties_cont_wvwo[is.na(full_panel$number_other_ties_cont_wvwo)] <- full_panel$number_high_ties_wvwo[is.na(full_panel$number_other_ties_cont_wvwo)] #the NAs are now lower educated people. Thus merge in number og high ties for all these NAs

# Create a variable for number of ties when someone was 16 or 17
xx <- full_panel %>% filter(age == 16 | age == 17) %>% group_by(nomem_encr) %>% 
  mutate(mean_number_high_ties_wvwo_17 = mean(number_high_ties_wvwo, na.rm=T),
         mean_number_low_ties_wvwo_17 = mean(number_low_ties_wvwo, na.rm=T),
         mean_number_high_ties_17 = mean(number_high_ties, na.rm=T),
         mean_number_low_ties_17 = mean(number_low_ties, na.rm=T)) %>% ungroup() %>%
  dplyr::select(nomem_encr, mean_number_high_ties_wvwo_17, mean_number_low_ties_wvwo_17, mean_number_high_ties_17, mean_number_low_ties_17) %>%
  distinct(nomem_encr, .keep_all=T) 

full_panel <- left_join(full_panel, xx)
rm(xx)

#Turn into a useful dummy
full_panel$had_hightieswvwo_when17 <- ifelse(full_panel$mean_number_high_ties_wvwo_17 > 0,1,0)
full_panel$had_highties_when17 <- ifelse(full_panel$mean_number_high_ties_17 > 0,1,0)


# Add a continuous variable for talking about politics for the mediation analysis
full_panel <- full_panel %>% mutate(var1 = (pers1_HU * pers1_talk) + (pers2_HU * pers2_talk)
                                    + (pers3_HU * pers3_talk) + (pers4_HU * pers4_talk) + (pers5_HU * pers5_talk))

full_panel <- full_panel %>% mutate(var2 = (pers1_LU * pers1_talk) + (pers2_LU * pers2_talk)
                                    + (pers3_LU * pers3_talk) + (pers4_LU * pers4_talk) + (pers5_LU * pers5_talk))

full_panel$mediation_talking_numberotherties <- NA
full_panel$mediation_talking_numberotherties[full_panel$higher_edu_atten==0] <- full_panel$var1[full_panel$higher_edu_atten==0]
full_panel$mediation_talking_numberotherties[full_panel$higher_edu_atten==1] <- full_panel$var2[full_panel$higher_edu_atten==1]
full_panel$mediation_talking_numberotherties <- full_panel$mediation_talking_numberotherties / full_panel$number_other_ties_cont

full_panel <- full_panel %>% dplyr::select(-var1, -var2)







# Add the sorting variable for a respondent in the moment they first enter the panel as well as some control variables
xx <- full_panel %>% group_by(nomem_encr) %>% filter(year == min(year)) %>% 
  dplyr::select(nomem_encr, sorting_number_ties,
         employed, nettoink_stand, urban_dummy,
         migration_background) %>%
  rename(min_sorting_number_ties = sorting_number_ties,
         min_employed = employed,
         min_nettoink_stand = nettoink_stand,
         min_urban_dummy = urban_dummy,
         min_migration_background = migration_background)

full_panel <- left_join(full_panel, xx)
rm(xx)

# set baselines
full_panel$min_sorting_number_ties <- factor(full_panel$min_sorting_number_ties)
full_panel$min_sorting_number_ties <- relevel(full_panel$min_sorting_number_ties, ref="0__1")

full_panel$sorting_number_ties <- factor(full_panel$sorting_number_ties, 
                                         levels = c("0__1", "1__1", "2__1", "3 or more ties__1",
                                                    "3 or more ties__0", "2__0", "1__0", "0__0"))


# Add if someone has a change in education level and if they will ever get treated
full_panel <- full_panel %>% group_by(nomem_encr) %>% mutate(mean_higher_edu_atten = mean(higher_edu_atten, na.rm=T)) %>% ungroup()
full_panel$has_change_in_higher_edu_atten <- ifelse(full_panel$mean_higher_edu_atten != 0 & full_panel$mean_higher_edu_atten != 1, 1, 0 )
full_panel$ever_treated_higher_edu_atten <- ifelse(full_panel$mean_higher_edu_atten > 0, 1, 0 )






## Add if someone has a change in the number of high ties. 
full_panel <- full_panel %>% group_by(nomem_encr) %>%
  mutate(number_high_ties_lag = dplyr::lag(number_high_ties)) %>% ungroup()

full_panel$change_number_high_ties <- NA
full_panel$change_number_high_ties <- ifelse(full_panel$number_high_ties > full_panel$number_high_ties_lag, 1, full_panel$change_number_high_ties) #if you see an increase, then you get treated
full_panel$change_number_high_ties <- ifelse(full_panel$number_high_ties < full_panel$number_high_ties_lag, 3, full_panel$change_number_high_ties) #for a decrease you get a 3, these will become missing values

full_panel <- full_panel %>% group_by(nomem_encr) %>% 
  mutate(loses_a_hightie = ifelse(3 %in% change_number_high_ties, "loses tie", "doesn't loose"),
         gets_a_hightie = ifelse(1 %in% change_number_high_ties, "gains tie", "doesn't gain")) %>%
  fill(change_number_high_ties, .direction = "down") %>% 
  ungroup()

full_panel$change_number_high_ties[!is.na(full_panel$number_high_ties) & is.na(full_panel$change_number_high_ties)] <- 0
full_panel$change_number_high_ties[full_panel$change_number_high_ties==3] <- NA


full_panel <- full_panel %>% group_by(nomem_encr, age) %>% 
  mutate(gets_a_hightie_numeric = ifelse(1 %in% change_number_high_ties, 1, 0)) %>%
  ungroup()

# Percentage of people in sample that experience change in ties
full_panel %>% group_by(nomem_encr) %>% mutate(haschange = ifelse(mean(gets_a_hightie_numeric,na.rm=T)>0,1,0)) %>%
  ungroup() %>%
  summarize(mean(haschange,na.rm=T))


full_panel %>% group_by(nomem_encr) %>% mutate(haschange = ifelse(mean(gets_a_hightie_numeric,na.rm=T)>0,1,0)) %>%
  ungroup() %>% group_by(higher_edu) %>%
  summarize(mean(haschange,na.rm=T))


## Graph for age and when people gain a tie
full_panel_plotdf <- full_panel %>% filter(!is.na(gets_a_hightie_numeric)) %>%
  group_by(age) %>%
  mutate(n_tot = n()) %>% filter(gets_a_hightie_numeric == 1) %>%
  summarize(perc_gets_tie = n()/n_tot * 100) %>% distinct(age, .keep_all = T)

plot <- ggplot(subset(full_panel_plotdf, age < 80), aes(x=age, y=perc_gets_tie)) + geom_point() +
  labs(x="Age", y="Percentage of people that experience a tie increase at a given age")
plot
ggsave(filename="LISS/plots/figa3_change_ties.png", plot=plot, dpi=300, width=6, height=6.5)
rm(full_panel_plotdf, plot)
# Figure a3





## Graph for age AND EDUCATION and when people gain a tie
full_panel_plotdf <- full_panel %>% filter(!is.na(gets_a_hightie_numeric)) %>%
  group_by(age, ever_treated_higher_edu_atten) %>%
  mutate(n_tot = n()) %>% filter(gets_a_hightie_numeric == 1) %>%
  summarize(perc_gets_tie = n()/n_tot * 100) %>% distinct(age, .keep_all = T)
full_panel_plotdf$ever_treated_higher_edu_atten <- as.character(full_panel_plotdf$ever_treated_higher_edu_atten)
full_panel_plotdf$ever_treated_higher_edu_atten <- ifelse(full_panel_plotdf$ever_treated_higher_edu_atten ==1, "Has attended/is attending/will attend higher-education", "Will not attend higher education")

plot <- ggplot(subset(full_panel_plotdf, age < 80), aes(x=age, y=perc_gets_tie, color=ever_treated_higher_edu_atten, shape=ever_treated_higher_edu_atten)) + geom_point() +
  labs(x="Age", y="Percentage of people that experience a tie increase at a given age", color="", shape="") + theme(legend.position = "bottom") + guides(color=guide_legend(nrow=2,byrow=TRUE))
plot

ggsave(filename="LISS/plots/figa3_change_ties_educ.png", plot=plot, dpi=300, width=6, height=6.5)
rm(full_panel_plotdf, plot)
# Figure a3





# Add a regression predicting who gets a high ties with control variables from cross-sectional part as predictors
reg <- lm_robust(gets_a_hightie_numeric~nettoink_stand + gender + age10 + 
                   urban_dummy + migration_background + employed + parents_HU_anyw + higher_edu, data=full_panel, cluster=nomem_encr)

model <- texreg(l=list(reg), use.packages=F,
                stars=c(0.05, 0.01, 0.001), 
                digits=3, include.ci = FALSE, booktabs=T,
                custom.model.names = c("DV: Getting a high tie (0-1)"),
                custom.coef.names = c("Intercept", "Income (standardized)", "Gender", "Age by 10 years", "Urban", "Migration background", "Employed", "Any parent higher educated", "Higher educated"),
                reorder.coef=c(2,3,4,5,6,7,8,9,1),
                caption.above=T,
                label="liss_whogetstie",
                caption="LISS - Predicting experiencing a tie increase using pooled OLS.",
                threeparttable = TRUE,
                float.pos="htpb!",
                custom.note="\\item %stars. The outcome captures whether people experience a tie increase (getting a new higher-educated tie) at some point in the panel. Income is standardized so that a one-unit increase is a standard deviation increase in income. We run a pooled OLS model. The outcome is constant per respondent but the predictor variables may vary over time.")

print(model, file = "LISS/tables/taba13_liss_whogetstie.tex")


rm(reg)





##### Add a control for the professional composition of someone's network
# Add dummy variables for the profession categories for someone's network
full_panel$netprof_manager_pers1 <- ifelse(full_panel$pers1_prof==2 | full_panel$pers1_prof==4,1,0)
full_panel$netprof_manager_pers2 <- ifelse(full_panel$pers2_prof==2 | full_panel$pers2_prof==4,1,0)
full_panel$netprof_manager_pers3 <- ifelse(full_panel$pers3_prof==2 | full_panel$pers3_prof==4,1,0)
full_panel$netprof_manager_pers4 <- ifelse(full_panel$pers4_prof==2 | full_panel$pers4_prof==4,1,0)
full_panel$netprof_manager_pers5 <- ifelse(full_panel$pers5_prof==2 | full_panel$pers5_prof==4,1,0)

full_panel$netprof_think_pers1 <- ifelse(full_panel$pers1_prof==1 | full_panel$pers1_prof==3,1,0)
full_panel$netprof_think_pers2 <- ifelse(full_panel$pers2_prof==1 | full_panel$pers2_prof==3,1,0)
full_panel$netprof_think_pers3 <- ifelse(full_panel$pers3_prof==1 | full_panel$pers3_prof==3,1,0)
full_panel$netprof_think_pers4 <- ifelse(full_panel$pers4_prof==1 | full_panel$pers4_prof==3,1,0)
full_panel$netprof_think_pers5 <- ifelse(full_panel$pers5_prof==1 | full_panel$pers5_prof==3,1,0)

full_panel$netprof_otwhitecol_pers1 <- ifelse(full_panel$pers1_prof==5,1,0)
full_panel$netprof_otwhitecol_pers2 <- ifelse(full_panel$pers2_prof==5,1,0)
full_panel$netprof_otwhitecol_pers3 <- ifelse(full_panel$pers3_prof==5,1,0)
full_panel$netprof_otwhitecol_pers4 <- ifelse(full_panel$pers4_prof==5,1,0)
full_panel$netprof_otwhitecol_pers5 <- ifelse(full_panel$pers5_prof==5,1,0)

full_panel$netprof_manual_pers1 <- ifelse(full_panel$pers1_prof==6 | full_panel$pers1_prof==7 | full_panel$pers1_prof==8 | full_panel$pers1_prof==9, 1, 0)
full_panel$netprof_manual_pers2 <- ifelse(full_panel$pers2_prof==6 | full_panel$pers2_prof==7 | full_panel$pers2_prof==8 | full_panel$pers2_prof==9, 1, 0)
full_panel$netprof_manual_pers3 <- ifelse(full_panel$pers3_prof==6 | full_panel$pers3_prof==7 | full_panel$pers3_prof==8 | full_panel$pers3_prof==9, 1, 0)
full_panel$netprof_manual_pers4 <- ifelse(full_panel$pers4_prof==6 | full_panel$pers4_prof==7 | full_panel$pers4_prof==8 | full_panel$pers4_prof==9, 1, 0)
full_panel$netprof_manual_pers5 <- ifelse(full_panel$pers5_prof==6 | full_panel$pers5_prof==7 | full_panel$pers5_prof==8 | full_panel$pers5_prof==9, 1, 0)

# Add a number of ties variable that does not remove parents and or partners
full_panel$pers1_notna <- ifelse(!is.na(full_panel$pers1_know), 1, 0)
full_panel$pers2_notna <- ifelse(!is.na(full_panel$pers2_know), 1, 0)
full_panel$pers3_notna <- ifelse(!is.na(full_panel$pers3_know), 1, 0)
full_panel$pers4_notna <- ifelse(!is.na(full_panel$pers4_know), 1, 0)
full_panel$pers5_notna <- ifelse(!is.na(full_panel$pers5_know), 1, 0)
full_panel$number_of_ties_allnetw <- rowSums(full_panel[, c("pers1_notna", "pers2_notna", "pers3_notna", ## This is counting number of people that are knowin in general. 
                                                            "pers4_notna", "pers5_notna")],na.rm=T)

# Add the proportion of a network that has a given profession
full_panel$netprof_manager_prop <- rowSums(full_panel[, c("netprof_manager_pers1", "netprof_manager_pers2", "netprof_manager_pers3",
                                                          "netprof_manager_pers4", "netprof_manager_pers5")],na.rm=T) / full_panel$number_of_ties_allnetw
full_panel$netprof_think_prop <- rowSums(full_panel[, c("netprof_think_pers1", "netprof_think_pers2", "netprof_think_pers3",
                                                          "netprof_think_pers4", "netprof_think_pers5")],na.rm=T) / full_panel$number_of_ties_allnetw
full_panel$netprof_otwhitecol_prop <- rowSums(full_panel[, c("netprof_otwhitecol_pers1", "netprof_otwhitecol_pers2", "netprof_otwhitecol_pers3",
                                                          "netprof_otwhitecol_pers4", "netprof_otwhitecol_pers5")],na.rm=T) / full_panel$number_of_ties_allnetw
full_panel$netprof_manual_prop <- rowSums(full_panel[, c("netprof_manual_pers1", "netprof_manual_pers2", "netprof_manual_pers3",
                                                          "netprof_manual_pers4", "netprof_manual_pers5")],na.rm=T) / full_panel$number_of_ties_allnetw









# ################ Cross-sectional results ---------------------------------------
############# MODELS
x <- lm_robust(eu_progressive_high ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
            urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
              netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0 ), 
            cluster=nomem_encr)
x1 <- lm_robust(immigration_icw_outcome ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
            urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
              netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel, ineducatio==0  & total_ties > 0), 
            cluster=nomem_encr)
x2 <- lm_robust(vote_RRP ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
            urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
              netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel, ineducatio==0  & total_ties > 0), 
            cluster=nomem_encr)
x3 <- lm_robust(vote_prog ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
            urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
              netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel, ineducatio==0  & total_ties > 0), 
            cluster=nomem_encr)
# Added post R&R
x4 <- lm_robust(redis ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel, ineducatio==0  & total_ties > 0), 
                cluster=nomem_encr)




## Table for appendix model with control variables
model <- texreg(l=list(x, x1, x2, x3, x4), use.packages=F,
                stars=c(0.05, 0.01, 0.001),  
                digits=3, include.ci = FALSE, booktabs=T,
                custom.model.names = c("DV: EU", "Immigration", "Voting RRP", "Voting Progressive", "Redistribution"),
                omit.coef=c("(year)|(profession)"), 
                float.pos="htpb!",
                caption.above=T, single.row=T, fontsize="small",
                label="tab:main_liss_appendix",
                caption="LISS - Effect of network homogeneity on attitudes and voting",
       custom.coef.names=c("Intercept", "High edu, 1 lower-edu tie", "High edu, 2 lower-edu ties",
                           "High edu, 3 or more lower-edu ties", 
                           "Low edu, 3 or more higher-edu ties", "Low education, 2 higher-edu ties",
                           "Low edu, 1 higher-edu tie", "Low education, no higher-edu ties",
                           "Total ties", "Income (standardized)", "Women", "Age (by 10 years)", "Urban", "Migration background", "Employed", "Education (continuous)", "Political interest", "Parents(s) higher educated",
                           "Proportion of network in managing occup", "Proportion of network in analytical occup", 
                           "Prop of netwrk in other white-collar occup", "Proportion of network in manual occup"),
                threeparttable = TRUE,
       reorder.coef=c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,1),
                custom.gof.rows=list("Controls"= c("Yes", "Yes", "Yes","Yes","Yes"),
                                     "Year FE"= c("Yes", "Yes", "Yes","Yes","Yes")),
                custom.note="\\item %stars. The reference category are Higher educated respondents with no ties to lower educated people. Voting RRP is captured with a dummy variable that turns on if respondents would vote for the PVV, FvD, or LPF if the election were today. Voting progressive is a dummy variable that turns on if respondents would vote for the D66 or GL if the election were today. The EU variable is measured on a 5-step Likert scale and the immigration variable is an ICW index composed of three different variables, scaled from 0 to 1. Redistribution is measured on a 1-5 Likert. Controls include someone's profession (not shown).")

print(model, file = "LISS/tables/taba1_main_liss_appendix.tex")
rm(model)
# Table a1





######## Check the variance within sorting groups for each one of our outcomes
eu_var <- full_panel %>% filter(ineducatio==0  & total_ties > 0) %>%   dplyr::select(eu_progressive_high, sorting_number_ties, total_ties, nettoink_stand, gender, age10, 
                                                                    urban_dummy, migration_background, employed, education, pol_int, parents_HU_anyw, 
                                                                    netprof_manager_prop, netprof_think_prop, netprof_otwhitecol_prop, netprof_manual_prop, 
                                                                    yearf, profession_404_clean, ineducatio, nomem_encr) %>%
  drop_na() %>%
  group_by(sorting_number_ties) %>%
  summarize(sd = sd(eu_progressive_high)) 


immi_var <- full_panel %>% filter(ineducatio==0  & total_ties > 0) %>%  dplyr::select(immigration_icw_outcome, sorting_number_ties, total_ties, nettoink_stand, gender, age10, 
                                                                             urban_dummy, migration_background, employed, education, pol_int, parents_HU_anyw, 
                                                                             netprof_manager_prop, netprof_think_prop, netprof_otwhitecol_prop, netprof_manual_prop, 
                                                                             yearf, profession_404_clean, ineducatio, nomem_encr) %>%
  drop_na() %>%
  group_by(sorting_number_ties) %>%
  summarize(sd = sd(immigration_icw_outcome)) 

rrp_var <- full_panel %>% filter(ineducatio==0  & total_ties > 0) %>%  dplyr::select(vote_RRP, sorting_number_ties, total_ties, nettoink_stand, gender, age10, 
                                                                             urban_dummy, migration_background, employed, education, pol_int, parents_HU_anyw, 
                                                                             netprof_manager_prop, netprof_think_prop, netprof_otwhitecol_prop, netprof_manual_prop, 
                                                                             yearf, profession_404_clean, ineducatio, nomem_encr) %>%
  drop_na() %>%
  group_by(sorting_number_ties) %>%
  summarize(sd = sd(vote_RRP)) 

voteprog_var <- full_panel %>% filter(ineducatio==0  & total_ties > 0) %>%  dplyr::select(vote_prog, sorting_number_ties, total_ties, nettoink_stand, gender, age10, 
                                                                             urban_dummy, migration_background, employed, education, pol_int, parents_HU_anyw, 
                                                                             netprof_manager_prop, netprof_think_prop, netprof_otwhitecol_prop, netprof_manual_prop, 
                                                                             yearf, profession_404_clean, ineducatio, nomem_encr) %>%
  drop_na() %>%
  group_by(sorting_number_ties) %>%
  summarize(sd = sd(vote_prog)) 











############# For each outcome, a graph with predicted probabilities (reported in the paper)

## Means for the EU var
x <- lm(eu_progressive_high ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                 urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
eu_means <- ggemmeans(x, terms=c("sorting_number_ties"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")
x <- lm(eu_progressive_high ~ higher_edu_atten + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
eu_group_means <- ggemmeans(x, terms=c("higher_edu_atten"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")

# Add an education grouping variable
eu_means$education <- ifelse(grepl("_1", eu_means$x), "Higher Educated", "Lower Educated")

# Get the percentages of each sorting group for the labels
perc <- full_panel %>% ungroup() %>% mutate(n_total = n()) %>% group_by(sorting_number_ties) %>% mutate(n_group = n()) %>%
  dplyr::select(n_total, n_group) %>%  distinct(sorting_number_ties, .keep_all = T)
perc$percentage <- round(perc$n_group / perc$n_total * 100, 0)


# Set the levels and labels. 
eu_means$x <- factor(eu_means$x, levels = c("0__0", "1__0", "2__0", "3 or more ties__0", "3 or more ties__1", "2__1", "1__1", "0__1"),
                  labels = c("Low 0 ties (42%)", "Low 1 tie (13%)", "Low 2 ties (7%)",
                             "Low >2 ties (5%)", "High >2 ties (4%)", "High 2 ties (5%)",
                             "High 1 tie (8%)", "High 0 ties (16%)"))

eu_means$mean_eu <- eu_group_means$predicted[2]
eu_means$mean_eu[eu_means$education=="Lower Educated"] <- eu_group_means$predicted[1]

plot_pred_eu <- ggplot(eu_means, 
            aes(x=predicted, y=x, xmin=conf.low, xmax=conf.high, color=education, shape=education)) + 
  geom_pointrange(size=0.8) + 
  labs(y="", x="Predicted position on EU scale",
       color="Education or respondents:", shape="Education or respondents:",
       title="") +
  scale_color_grey(end=0.6) + theme(legend.position = "none") +
  geom_point(aes(x=mean_eu, y=x, color=education), shape=108, size=7, alpha = 0.6)




### Immigration
## Means for the EU var
x <- lm(immigration_icw_outcome ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
immi_means <- ggemmeans(x, terms=c("sorting_number_ties"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")
x <- lm(immigration_icw_outcome ~ higher_edu_atten + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
immi_group_means <- ggemmeans(x, terms=c("higher_edu_atten"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")

# Add an education grouping variable
immi_means$education <- ifelse(grepl("_1", immi_means$x), "Higher Educated", "Lower Educated")

# Set the levels and labels. 
immi_means$x <- factor(immi_means$x, levels = c("0__0", "1__0", "2__0", "3 or more ties__0", "3 or more ties__1", "2__1", "1__1", "0__1"),
                     labels = c("Low 0 ties (42%)", "Low 1 tie (13%)", "Low 2 ties (7%)",
                                "Low >2 ties (5%)", "High >2 ties (4%)", "High 2 ties (5%)",
                                "High 1 tie (8%)", "High 0 ties (16%)"))

immi_means$mean_immi <- immi_group_means$predicted[2]
immi_means$mean_immi[immi_means$education=="Lower Educated"] <- immi_group_means$predicted[1]

plot_pred_immi <- ggplot(immi_means, 
       aes(x=predicted, y=x, xmin=conf.low, xmax=conf.high, color=education, shape=education)) + 
  geom_pointrange(size=0.8) + 
  labs(y="", x="Predicted position on immigration scale",
       color="Education or respondents:", shape="Education or respondents:",
       title="") +
  scale_color_grey(end=0.6) + theme(legend.position = "none") +
  geom_point(aes(x=mean_immi, y=x, color=education), shape=108, size=7, alpha = 0.6)






######## RRP
x <- lm(vote_RRP ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
rrp_means <- ggemmeans(x, terms=c("sorting_number_ties"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")
x <- lm(vote_RRP ~ higher_edu_atten + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw +
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
rrp_group_means <- ggemmeans(x, terms=c("higher_edu_atten"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")

# Add an education grouping variable
rrp_means$education <- ifelse(grepl("_1", rrp_means$x), "Higher Educated", "Lower Educated")

# Set the levels and labels. 
rrp_means$x <- factor(rrp_means$x, levels = c("0__0", "1__0", "2__0", "3 or more ties__0", "3 or more ties__1", "2__1", "1__1", "0__1"),
                       labels = c("Low 0 ties (42%)", "Low 1 tie (13%)", "Low 2 ties (7%)",
                                  "Low >2 ties (5%)", "High >2 ties (4%)", "High 2 ties (5%)",
                                  "High 1 tie (8%)", "High 0 ties (16%)"))

rrp_means$mean_rrp <- rrp_group_means$predicted[2]
rrp_means$mean_rrp[rrp_means$education=="Lower Educated"] <- rrp_group_means$predicted[1]

plot_pred_rrp <- ggplot(rrp_means, 
                         aes(x=predicted, y=x, xmin=conf.low, xmax=conf.high, color=education, shape=education)) + 
  geom_pointrange(size=0.8) + 
  labs(y="", x="Predicted probability to vote RRP",
       color="Education or respondents:", shape="Education or respondents:",
       title="") +
  scale_color_grey(end=0.6) + theme(legend.position = "none", axis.text.y = element_blank()) +
  geom_point(aes(x=mean_rrp, y=x, color=education), shape=108, size=7, alpha = 0.6)






############ Progressive
x <- lm(vote_prog ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
prog_means <- ggemmeans(x, terms=c("sorting_number_ties"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")
x <- lm(vote_prog ~ higher_edu_atten + total_ties + nettoink_stand + gender + age10 + 
          urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
          netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel, ineducatio==0  & total_ties > 0))
prog_group_means <- ggemmeans(x, terms=c("higher_edu_atten"), vcov.fun="vcovCL", vcov.args="cluster = ~nomem_encr")

# Add an education grouping variable
prog_means$education <- ifelse(grepl("_1", prog_means$x), "Higher Educated", "Lower Educated")

# Set the levels and labels. 
prog_means$x <- factor(prog_means$x, levels = c("0__0", "1__0", "2__0", "3 or more ties__0", "3 or more ties__1", "2__1", "1__1", "0__1"),
                      labels = c("Low 0 ties (42%)", "Low 1 tie (13%)", "Low 2 ties (7%)",
                                 "Low >2 ties (5%)", "High >2 ties (4%)", "High 2 ties (5%)",
                                 "High 1 tie (8%)", "High 0 ties (16%)"))

prog_means$mean_prog <- prog_group_means$predicted[2]
prog_means$mean_prog[prog_means$education=="Lower Educated"] <- prog_group_means$predicted[1]

plot_pred_prog <- ggplot(prog_means, 
                        aes(x=predicted, y=x, xmin=conf.low, xmax=conf.high, color=education, shape=education)) + 
  geom_pointrange(size=0.8) + 
  labs(y="", x="Predicted probability to vote progressive",
       color="Education or respondents:", shape="Education or respondents:",
       title="") +
  scale_color_grey(end=0.6) + theme(legend.position = "none", axis.text.y = element_blank()) +
  geom_point(aes(x=mean_prog, y=x, color=education), shape=108, size=7, alpha = 0.6)



## Save the plots
plots <- grid.arrange(plot_pred_eu, plot_pred_prog, plot_pred_immi, plot_pred_rrp, nrow = 2, ncol=2)
plots
ggsave(filename="LISS/plots/fig2_predprob_plots.png", plot=plots, dpi=600, width=9, height=6)
# Figure 2 all ties










############# Appendix: main models without partners for each respondent
#sorting parents_HU_anyw parents_HU
# sorting_number_ties_friends

x <- lm_robust(eu_progressive_high ~ sorting_number_ties_friends + total_ties_friends + nettoink_stand + gender + age10 + 
                 urban_dummy + migration_background + employed + parents_HU_anyw + education + pol_int +  
                 netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + as.factor(year) + profession_404_clean, 
               data = subset(full_panel, ineducatio==0  & total_ties_friends > 0), 
               cluster=nomem_encr)
x1 <- lm_robust(immigration_icw_outcome ~ sorting_number_ties_friends + total_ties_friends + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + parents_HU_anyw + education + pol_int + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + as.factor(year) + profession_404_clean, 
                data =  subset(full_panel, ineducatio==0  & total_ties_friends > 0), 
                cluster=nomem_encr)
x2 <- lm_robust(vote_RRP ~ sorting_number_ties_friends + total_ties_friends + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + parents_HU_anyw + education + pol_int + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + as.factor(year) + profession_404_clean, 
                data =  subset(full_panel, ineducatio==0  & total_ties_friends > 0), 
                cluster=nomem_encr)
x3 <- lm_robust(vote_prog ~ sorting_number_ties_friends + total_ties_friends + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + parents_HU_anyw + education + pol_int + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + as.factor(year) + profession_404_clean, 
                data =  subset(full_panel, ineducatio==0  & total_ties_friends > 0), 
                cluster=nomem_encr)
x4 <- lm_robust(redis ~ sorting_number_ties_friends + total_ties_friends + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + parents_HU_anyw + education + pol_int + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + as.factor(year) + profession_404_clean, 
                data =  subset(full_panel, ineducatio==0  & total_ties_friends > 0), 
                cluster=nomem_encr)





## Appendix model with only friends
model <- texreg(l=list(x, x1, x2, x3, x4), use.packages=F,
                stars=c(0.05, 0.01, 0.001),  
                digits=3, include.ci = FALSE, booktabs=T,  single.row=T, fontsize="small",
                custom.model.names = c("DV: EU", "Immigration", "Voting RRP", "Voting Progressive", "Redistribution"),
                omit.coef=c("(year)|(profession)"), 
                caption.above=T,
                label="main_liss_appendix_contpartner",
                caption="LISS - Effect of network homogeneity on attitudes and voting. Without partners.",
                custom.coef.names=c("Intercept", "High edu, 1 lower-educated tie", "High edu, 2 lower-educated ties",
                                    "High edu, 3 or more lower-edu ties", 
                                    "Low edu, 3 or more higher-edu ties", "Low edu, 2 higher-educated ties",
                                    "Low edu, 1 higher-educated tie", "Low edu, no higher-educated ties",
                                    "Total ties (without parents)", "Income (standardized)", "Women", "Age (by 10 years)", "Urban", "Migration background", "Employed", "Parent(s) higher educated", "Education (continuous)", "Political interest",
                                    "Proportion of network in managing occupation", "Proportion of network in analytical occupation", 
                                    "Prop of netw in other white-collar occupation", "Proportion of network in manual occupation"),
                threeparttable = TRUE,
                float.pos="htpb!",
                reorder.coef=c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,1),
                custom.gof.rows=list("Controls"= c("Yes", "Yes", "Yes","Yes","Yes"),
                                     "Year FE"= c("Yes", "Yes", "Yes","Yes","Yes")),
                custom.note="\\item %stars. The reference category are Higher educated respondents with no ties to lower educated people. Voting RRP is captured with a dummy variable that turns on if respondents would vote for the PVV, FvD, or LPF if the election were today. Voting progressive is a dummy variable that turns on if respondents would vote for the D66 or GL if the election were today. The EU variable is measured on a 5-step Likert scale and the immigration variable is an ICW index composed of three different variables, scaled from 0 to 1. Redistribution is measured on a 1-5 Likert. Controls include someone's profession (not shown).")

print(model, file = "LISS/tables/taba5_main_liss_appendix_contpartner.tex")
rm(model)
# Table a5













############# Appendix: main model with one person per household

############# MODELS
full_panel_oneperhous <- full_panel  %>% group_by(year) %>% distinct(nohouse_encr, .keep_all=T)

x <- lm_robust(eu_progressive_high ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                 urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                 netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data = subset(full_panel_oneperhous, ineducatio==0  & total_ties > 0 ), 
               cluster=nomem_encr)
x1 <- lm_robust(immigration_icw_outcome ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel_oneperhous, ineducatio==0  & total_ties > 0), 
                cluster=nomem_encr)
x2 <- lm_robust(vote_RRP ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel_oneperhous, ineducatio==0  & total_ties > 0), 
                cluster=nomem_encr)
x3 <- lm_robust(vote_prog ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel_oneperhous, ineducatio==0  & total_ties > 0), 
                cluster=nomem_encr)
# Added post R&R
x4 <- lm_robust(redis ~ sorting_number_ties + total_ties + nettoink_stand + gender + age10 + 
                  urban_dummy + migration_background + employed + education + pol_int + parents_HU_anyw + 
                  netprof_manager_prop + netprof_think_prop + netprof_otwhitecol_prop + netprof_manual_prop + yearf + profession_404_clean, data =  subset(full_panel_oneperhous, ineducatio==0  & total_ties > 0), 
                cluster=nomem_encr)




## Table for appendix model with control variables
model <- texreg(l=list(x, x1, x2, x3, x4), use.packages=F,
                stars=c(0.05, 0.01, 0.001),  
                digits=3, include.ci = FALSE, booktabs=T,
                custom.model.names = c("DV: EU", "Immigration", "Voting RRP", "Voting Progressive", "Redistribution"),
                omit.coef=c("(year)|(profession)"), 
                float.pos="htpb!",
                caption.above=T, single.row=T, fontsize="small",
                label="tab:main_liss_appendix_oneperhouse",
                caption="LISS - Effect of network homogeneity on attitudes and voting. One person per household",
                custom.coef.names=c("Intercept", "High edu, 1 lower-edu tie", "High edu, 2 lower-edu ties",
                                    "High edu, 3 or more lower-edu ties", 
                                    "Low edu, 3 or more higher-edu ties", "Low education, 2 higher-edu ties",
                                    "Low edu, 1 higher-edu tie", "Low education, no higher-edu ties",
                                    "Total ties", "Income (standardized)", "Women", "Age (by 10 years)", "Urban", "Migration background", "Employed", "Education (continuous)", "Political interest", "Parents(s) higher educated",
                                    "Proportion of network in managing occup", "Proportion of network in analytical occup", 
                                    "Prop of netwrk in other white-collar occup", "Proportion of network in manual occup"),
                threeparttable = TRUE,
                reorder.coef=c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,1),
                custom.gof.rows=list("Controls"= c("Yes", "Yes", "Yes","Yes","Yes"),
                                     "Year FE"= c("Yes", "Yes", "Yes","Yes","Yes")),
                custom.note="\\item %stars. The reference category are Higher educated respondents with no ties to lower educated people. Voting RRP is captured with a dummy variable that turns on if respondents would vote for the PVV, FvD, or LPF if the election were today. Voting progressive is a dummy variable that turns on if respondents would vote for the D66 or GL if the election were today. The EU variable is measured on a 5-step Likert scale and the immigration variable is an ICW index composed of three different variables, scaled from 0 to 1. Redistribution is measured on a 1-5 Likert. Controls include someone's profession (not shown).")

print(model, file = "LISS/tables/taba6_main_liss_appendix_oneperhouse.tex")
rm(model)
# Table A6




















# ################ Social Influence ---------------------------------------


## Main model function (both education groups together, controlling for attending high education)
model_social_infl_bothedu <- function(outcome, treatment){
  formula <- as.formula(paste0(outcome, "~", treatment, "+ higher_edu_atten + nettoink_stand + total_ties + age10"))
  hold1 <- plm(formula= formula,
               data=subset(full_panel, total_ties > 0 & age >18),
               index = c("nomem_encr", "year"),
               model="within", effect="twoway")
  people <- pdim(hold1)$nT$n
  obs <- pdim(hold1)$nT$N
  hold1 <- coeftest(hold1, vcov=function(x) 
    vcovHC(x, cluster="group", type="HC1"))
  results <- list(hold1, people, obs)
  return(results)
}


eu <- model_social_infl_bothedu("eu_progressive_high", "number_high_ties")
immi <- model_social_infl_bothedu("immigration_icw_outcome", "number_high_ties")
rrp <- model_social_infl_bothedu("term_pvv","number_high_ties")
gl <- model_social_infl_bothedu("term_d66gl","number_high_ties")
redis <- model_social_infl_bothedu("redis","number_high_ties")

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Therm", "Prog Therm"),
                custom.coef.names=c("Number of high ties", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!",
                label="main_liss_sorting",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]])),
                custom.note="\\item %stars.  The table shows that a change in the number of higher-educated ties someone has influences political attitudes and behavior. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU, an ICW index for immigration (0-1), and 11-step thermostat scales for the PVV and GL + D66.")

model
print(model, file = "LISS/tables/tab3_main_liss_sorting.tex")





model <- texreg(l=list(redis[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: Redistribution"),
                custom.coef.names=c("Number of high ties", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!",
                label="main_liss_sorting_redis",
                caption="LISS - The effect of a change in the education level of a tie on the change in redistribution preferences",
                custom.gof.rows=list("Individual and time FE"= c("Yes"),
                                     "Num. obs" =c(redis[[3]]),
                                     "Num. Respondents" =c(redis[[2]])),
                custom.note="\\item %stars.  The table shows that a change in the number of higher-educated ties someone has influences redistribution preferences (1-5 scale).")


print(model, file = "LISS/tables/taba16_main_liss_sorting_redis.tex")










## and with binary ties both grouped together
eu <- model_social_infl_bothedu("eu_progressive_high", "has_highties")
immi <- model_social_infl_bothedu("immigration_icw_outcome", "has_highties")
rrp <- model_social_infl_bothedu("term_pvv","has_highties")
gl <- model_social_infl_bothedu("term_d66gl","has_highties")
redis <- model_social_infl_bothedu("redis","has_highties")

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]], redis[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Therm", "Prog Therm", "Redistribution"),
                custom.coef.names=c("Having high ties (0-1)", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!", fontsize="scriptsize",
                label="main_liss_sorting_binary",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]], redis[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]], redis[[2]])),
                custom.note="\\item %stars.  The table shows that a change whether someone has higher-educated ties someone has influences political attitudes and behavior. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU and redistribution, an ICW index for immigration (0-1), and 11-step thermostat scales for the PVV and GL + D66.")

print(model, file = "LISS/tables/taba11_main_liss_sorting_binary.tex")
#table a11




#### Check within-individual effects of ties on each of the immigration variables 
immi1 <- model_social_infl_bothedu("immi_for_many_pro", "number_high_ties")
immi2 <- model_social_infl_bothedu("asylum_eas_rev", "number_high_ties")
immi3 <- model_social_infl_bothedu("diver_cultures_rev", "number_high_ties")

model <- texreg(l=list(immi1[[1]], immi2[[1]], immi3[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: Pro immigrants", "Pro asylum seekers", "Pro diverse cultures"),
                custom.coef.names=c("Change in number of high ties", "Attending higher education", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                float.pos="htpb!",
                caption.above=T,
                label="main_liss_sorting_immivars",
                caption="LISS - The effect of a change in the education level of a tie on each of the immigration variables",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes"),
                                     "Num. obs" =c(immi1[[3]], immi2[[3]], immi3[[3]]),
                                     "Num. Respondents" =c(immi1[[2]], immi2[[2]], immi3[[2]])),
                custom.note="\\item %stars.  The table shows that a change in the number of higher-educated ties someone has influences immigration attitudes. Two-way fixed effects models with standard errors clustered at the respondent level.")

model
print(model, file = "LISS/tables/taba10_main_liss_sorting_immivars.tex")
# Table a10










#### For the main models, run them with one person per household
full_panel_oneperhous <- full_panel  %>% group_by(year) %>% distinct(nohouse_encr, .keep_all=T)

model_social_infl_bothedu <- function(outcome, treatment){
  formula <- as.formula(paste0(outcome, "~", treatment, "+ higher_edu_atten + nettoink_stand + total_ties + age10"))
  hold1 <- plm(formula= formula,
               data=subset(full_panel_oneperhous, total_ties > 0 & age >18),
               index = c("nomem_encr", "year"),
               model="within", effect="twoway")
  people <- pdim(hold1)$nT$n
  obs <- pdim(hold1)$nT$N
  hold1 <- coeftest(hold1, vcov=function(x) 
    vcovHC(x, cluster="group", type="HC1"))
  results <- list(hold1, people, obs)
  return(results)
}


eu <- model_social_infl_bothedu("eu_progressive_high", "number_high_ties")
immi <- model_social_infl_bothedu("immigration_icw_outcome", "number_high_ties")
rrp <- model_social_infl_bothedu("term_pvv","number_high_ties")
gl <- model_social_infl_bothedu("term_d66gl","number_high_ties")

redis <- model_social_infl_bothedu("redis","number_high_ties")

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]], redis[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Therm", "Prog Therm", "Redistribution"),
                custom.coef.names=c("Number of high ties", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!",
                label="main_liss_sorting_oneperhh",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting. One person per household",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]], redis[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]], redis[[2]])),
                custom.note="\\item %stars.  In these models we only include one person per household. The table shows that a change in the number of higher-educated ties someone has influences political attitudes and behavior. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU and redistribution, an ICW index for immigration (0-1), and 11-step thermostat scales for the PVV and GL + D66.")

model
print(model, file = "LISS/tables/taba8_main_liss_sorting_oneperhh.tex")
# Table a8










############ For binary treatment, do an auror plot
## Do lags for binary treatment for auror plot

full_panel <- full_panel %>% group_by(nomem_encr) %>% 
  mutate(has_highties_lead_1 = dplyr::lead(has_highties, n = 1L),
         has_highties_lead_2 = dplyr::lead(has_highties, n = 2L)) %>%
  ungroup()


auror_plot <- function(outcome, ylabel){
  formula <- as.formula(paste(outcome, "~", "has_highties + has_highties_lead_1 + 
                           has_highties_lead_2  +
                            + higher_edu_atten + nettoink_stand + total_ties + age10"))
  
  holdem <- plm(formula=formula,
                data=subset(full_panel, total_ties > 0 & age >18),
                index = c("nomem_encr", "year"),
                model="within", effect="twoway") 
  holdem <- coeftest(holdem, vcov=function(x) 
    vcovHC(x, cluster="group", type="HC1"))
  point <- (holdem[1:3,1])
  lower <- (holdem[1:3,1] - 1.96*holdem[1:3,2]) 
  upper <- (holdem[1:3,1] + 1.96*holdem[1:3,2]) 
  yba <- c(0, -1,-2)
  plotdat <- data.frame(point=point,upper=upper, lower=lower, yba=yba)
  
  
  plot <- ggplot(plotdat, aes(x=yba,y=point, ymin=lower, ymax=upper)) + 
    geom_pointrange(position = position_dodge2(width=0.45), size=1, linewidth=1) +
    labs(x="Years before high-tie treatment", y=paste(ylabel)) +
    geom_hline(yintercept=0, linetype="dashed") + scale_x_continuous(breaks=c(-2, -1, 0), labels=c("-2", "-1", "0")) +
    geom_vline(xintercept=-0.08, linetype="solid", color="grey", linewidth=1.2)
  return(plot)
  
}


auror_eu <- auror_plot("eu_progressive_high", "Change in EU attitudes")
auror_immi <- auror_plot("immigration_icw_outcome", "Change in immigration attitudes")
auror_rrp <- auror_plot("term_pvv", "Change in PVV thermostat")
auror_gl <- auror_plot("term_d66gl", "Change in D66 and GL thermostat")
auror_redis <- auror_plot("redis", "Change in redistribution attitudes")

# Combined auror plots
plots <- grid.arrange(auror_eu, auror_gl, auror_immi, auror_rrp, auror_redis, nrow = 3, ncol=2)
ggsave(filename="LISS/plots/figa2_auror_plots.png", plot=plots, dpi=600, width=8, height=9)











############ For continues treatment, check whether the effects differ for gaining and losing treatment status
full_panel <- full_panel %>% group_by(nomem_encr) %>%   arrange(nomem_encr, year) %>%
  mutate(number_high_ties_lead_1 = dplyr::lead(number_high_ties, n = 1L),
         number_high_ties_lead_2 = dplyr::lead(number_high_ties, n = 2L),
         number_high_ties_lag_1 = dplyr::lag(number_high_ties, n = 1L),
         number_high_ties_lag_2 = dplyr::lag(number_high_ties, n = 2L)) %>%
  ungroup()

### First only the effect for those who gain highties
# Add a variable, if the current value is 0 and the last one was 1, then you should be dropped henceforth
full_panel$fromherereverse <- NA
full_panel$fromherereverse[full_panel$number_high_ties < full_panel$number_high_ties_lag_1] <- 1

full_panel <- full_panel %>%
  group_by(nomem_encr) %>%
  tidyr::fill(fromherereverse, .direction = "down")

full_panel_onlygains  <- full_panel %>% filter(is.na(fromherereverse))


# change the function for these dataframes and run the models
model_social_infl_bothedu_difdata <- function(outcome, treatment, data){
  formula <- as.formula(paste0(outcome, "~", treatment, "+ higher_edu_atten + nettoink_stand + total_ties + age10"))
  hold1 <- plm(formula= formula,
               data=subset(data, total_ties > 0 & age >18),
               index = c("nomem_encr", "year"),
               model="within", effect="twoway")
  people <- pdim(hold1)$nT$n
  obs <- pdim(hold1)$nT$N
  hold1 <- coeftest(hold1, vcov=function(x) 
    vcovHC(x, cluster="group", type="HC1"))
  results <- list(hold1, people, obs)
  return(results)
}


eu <- model_social_infl_bothedu_difdata("eu_progressive_high", "number_high_ties", full_panel_onlygains)
immi <- model_social_infl_bothedu_difdata("immigration_icw_outcome", "number_high_ties", full_panel_onlygains)
rrp <- model_social_infl_bothedu_difdata("term_pvv","number_high_ties", full_panel_onlygains)
gl <- model_social_infl_bothedu_difdata("term_d66gl","number_high_ties", full_panel_onlygains)

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Therm", "Prog Therm"),
                custom.coef.names=c("Number of high ties", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!", fontsize="scriptsize",
                label="main_liss_sorting_number_gainingtie",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting. Only treatment changes where respondents gain ties",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]])),
                custom.note="\\item %stars. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU, an ICW index for immigration (0-1), and 11-step thermostat scales for the PVV and GL + D66.")
model
print(model, file = "LISS/tables/taba15_main_liss_sorting_number_gainingtie.tex")
# Table a15



### And for people who lose a high tie
# if 0 and the next is 1 then drop
full_panel$fromheregains <- NA
full_panel$fromheregains[full_panel$number_high_ties > full_panel$number_high_ties_lag_1] <- 1
full_panel <- full_panel %>%
  group_by(nomem_encr) %>%
  tidyr::fill(fromheregains, .direction = "down")

full_panel_onlyloses  <- full_panel %>% filter(is.na(fromheregains))

eu <- model_social_infl_bothedu_difdata("eu_progressive_high", "number_high_ties", full_panel_onlyloses)
immi <- model_social_infl_bothedu_difdata("immigration_icw_outcome", "number_high_ties", full_panel_onlyloses)
rrp <- model_social_infl_bothedu_difdata("term_pvv","number_high_ties", full_panel_onlyloses)
gl <- model_social_infl_bothedu_difdata("term_d66gl","number_high_ties", full_panel_onlyloses)

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T,
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Therm", "Prog Therm"),
                custom.coef.names=c("Number of high ties", "Attending higher edu", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!", fontsize="scriptsize",
                label="main_liss_sorting_number_losingtie",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting. Only treatment changes where respondents lose ties",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]])),
                custom.note="\\item %stars. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU, an ICW index for immigration (0-1), and 11-step thermostat scales for the PVV and GL + D66.")
model
print(model, file = "LISS/tables/taba14_main_liss_sorting_number_losingtie.tex")
# Table a14
























########## Check the main model without parents in the coding
model_social_infl_bothedu <- function(outcome, treatment){
  formula <- as.formula(paste0(outcome, "~", treatment, "+ higher_edu_atten + nettoink_stand + total_ties + age10"))
  hold1 <- plm(formula= formula,
               data=subset(full_panel, total_ties > 0 & age >18),
               index = c("nomem_encr", "year"),
               model="within", effect="twoway")
  people <- pdim(hold1)$nT$n
  obs <- pdim(hold1)$nT$N
  hold1 <- coeftest(hold1, vcov=function(x) 
    vcovHC(x, cluster="group", type="HC1"))
  results <- list(hold1, people, obs)
  return(results)
}


eu <- model_social_infl_bothedu("eu_progressive_high", "number_high_ties_friends")
immi <- model_social_infl_bothedu("immigration_icw_outcome", "number_high_ties_friends")
rrp <- model_social_infl_bothedu("term_pvv","number_high_ties_friends")
gl <- model_social_infl_bothedu("term_d66gl","number_high_ties_friends")
redis <- model_social_infl_bothedu("redis","number_high_ties_friends")

model <- texreg(l=list(eu[[1]], immi[[1]], rrp[[1]], gl[[1]], redis[[1]]), use.packages=F,
                stars=c(0.10, 0.05, 0.01),
                digits=3, include.ci = FALSE, booktabs=T, threeparttable=T, fontsize="small",
                custom.model.names = c("DV: EU", "Immigration (ICW)", "RRP Thermostat", "Progressive Thermostat", "Redistribution"),
                custom.coef.names=c("Change in number of high ties", "Attending higher education", "Income (standardized)", "Total ties", "Age (in 10 years)"),
                caption.above=T,
                float.pos="htpb!",
                label="main_liss_sorting_nopartner",
                caption="LISS - The effect of a change in the education level of a tie on the change in attitudes and voting (without partner ties)",
                custom.gof.rows=list("Individual and time FE"= c("Yes", "Yes", "Yes", "Yes", "Yes"),
                                     "Num. obs" =c(eu[[3]], immi[[3]], rrp[[3]], gl[[3]], redis[[3]]),
                                     "Num. Respondents" =c(eu[[2]], immi[[2]], rrp[[2]], gl[[2]], redis[[2]])),
                custom.note="\\item %stars.  The table shows that a change in the number of higher-educated ties someone has influences political attitudes and behavior. Two-way fixed effects models with standard errors clustered at the respondent level. The outcome variables are a 5-step Likert scale for EU and redistribution, an ICW index for immigration and Gender (0-1), and 11-step thermostat scales for the PVV and GL + D66.")

model
print(model, file = "LISS/tables/taba9_main_liss_sorting_nopartner.tex")
# Table a9













### Applying DiD2s for the main model (with a binary treatment)
did2s_ties_eu <- did2s(full_panel,
                       yname="eu_progressive_high", first_stage = ~ 0 | nomem_encr + year,
                       second_stage = ~higher_edu_atten, treatment = "has_highties", 
                       cluster_var = "nomem_encr")
did2s_ties_immi <- did2s(full_panel,
                         yname="immigration_icw_outcome", first_stage = ~ 0 | nomem_encr + year,
                         second_stage = ~higher_edu_atten, treatment = "has_highties", 
                         cluster_var = "nomem_encr")
did2s_ties_pvv <- did2s(full_panel,
                        yname="term_pvv", first_stage = ~ 0 | nomem_encr + year,
                        second_stage = ~higher_edu_atten, treatment = "has_highties", 
                        cluster_var = "nomem_encr")
did2s_ties_d66gl <- did2s(full_panel,
                          yname="term_d66gl", first_stage = ~ 0 | nomem_encr + year,
                          second_stage = ~higher_edu_atten, treatment = "has_highties", 
                          cluster_var = "nomem_encr")
did2s_ties_redis <- did2s(full_panel,
                          yname="redis", first_stage = ~ 0 | nomem_encr + year,
                          second_stage = ~higher_edu_atten, treatment = "has_highties", 
                          cluster_var = "nomem_encr")


model <- texreg(l=list(did2s_ties_eu, did2s_ties_immi, did2s_ties_pvv, did2s_ties_d66gl, did2s_ties_redis), 
                use.packages=F,
                stars=c(0.10, 0.05, 0.01), 
                digits=3, include.ci = FALSE, booktabs=T,
                custom.model.names = c("DV: EU", "Immigration ICW", "RRP", "Progressive", "Redistribution"),
                custom.coef.names = c("Having high ties (0-1)"),
                caption.above=T,
                float.pos="htpb!",
                label="main_liss_sorting_did2s",
                caption="LISS - Effect of having higher-educated ties using DiD2s",
                threeparttable = TRUE,
                custom.note="\\item %stars. Standard Errors are clustered at the respondent level. The models use the DiD2s estimator to deal with bias caused by effect heterogeneity.")

print(model, file = "LISS/tables/taba12_main_liss_sorting_did2s.tex")
# Table a12
















######### Count proportion of treatment changes that are gaining a tie or losing a tie
full_panel_check <- full_panel %>% drop_na(higher_edu_atten, nettoink_stand, total_ties, age10, 
                                           term_pvv) %>% #ensure it's the same dataset
  filter(total_ties > 0 & age >18) %>% 
  arrange(nomem_encr, year)

# Add some key values
full_panel_check <- full_panel_check %>%
  group_by(nomem_encr) %>%
  mutate(
    # Create lagged variable for comparison
    prev_year_value = dplyr::lag(number_high_ties),
    
    # Correctly identify if there's a difference from the previous year
    diff_prev_year = if_else(!is.na(prev_year_value) & number_high_ties != prev_year_value, 1, 0, missing = 0),
    
    # Identify if the value was higher in the previous year
    higher_prev_year = if_else(!is.na(prev_year_value) & prev_year_value > number_high_ties, 1, 0, missing = 0),
    
    # Identify if the value was lower in the previous year
    lower_prev_year = if_else(!is.na(prev_year_value) & prev_year_value < number_high_ties, 1, 0, missing = 0)
  ) %>%
  ungroup()

# Calculate the total number of cases for each condition
summary_counts <- full_panel_check %>%
  summarize(
    total_changes = sum(diff_prev_year, na.rm = TRUE), # Total number of changes
    total_higher = sum(higher_prev_year, na.rm = TRUE), # Cases where value is higher than previous year
    total_lower = sum(lower_prev_year, na.rm = TRUE) # Cases where value is lower than previous year
  )
print(summary_counts)
rm(summary_counts)



# Calculate the total number of changes for people that experience a change in their ties. 
full_panel_check2 <- full_panel_check %>% 
  group_by(nomem_encr) %>%
  mutate(average_change = mean(diff_prev_year, na.rm = TRUE)) %>%
  ungroup()

full_panel_check2 %>% filter(average_change != 0) %>% group_by(nomem_encr) %>% 
  mutate(total_changes=sum(diff_prev_year,na.rm=T)) %>% 
  ungroup() %>% distinct(nomem_encr, .keep_all=T) %>% 
  summarize(mean(total_changes),
            sd(total_changes))
rm(full_panel_check2)




#### Calculate the percentage of people with at least one change in ties and average changes among people who have changes
# Add the mean change in ties for all people
full_panel_check <- full_panel_check %>% group_by(nomem_encr) %>%
  mutate(average_change = mean(diff_prev_year, na.rm = TRUE)) %>% ungroup()

# Calculate the percentage of people with at least one change in ties
full_panel_check <- full_panel_check %>% group_by(nomem_encr) %>%
  mutate(has_tie_change = ifelse(average_change >0, 1, 0)) %>% ungroup() 

full_panel_check %>% distinct(nomem_encr,.keep_all=T) %>% 
  summarize(percentage = mean(has_tie_change,na.rm=T)) %>%
  pull(percentage) # 53% of people have at least one tie change

# Among people that experience a change, how common is this
full_panel_check %>% filter(has_tie_change==1) %>% distinct(nomem_encr,.keep_all=T) %>%
  summarize(mean_changes = mean(average_change),
            sd_cheanges = sd(average_change))

rm(person_averages, percentage_non_zero_change, full_panel_check) 










































################ Case Control Method without any family---------------------------------------------------------
## Table 2 is created with the code below. 


# Run code up to line 1000 first before running this. It cleans the data

##### Case control method for only friends (excluding partners, parents, and siblings)
## Create case variables One row for each tie 
case_df <- full_panel  %>% 
  filter(age >= 25 & total_ties >= 1 & year >= 14) %>% # we want only those above 25. We take all ties except parents (like in all the models in the paper)
  group_by(nomem_encr) %>%
  slice_tail(n = 1) %>% # If choosing the last two waves, this keeps the last observation of each individual. Keep the last observation from each individual (the tail because the data is sorted from earliest to last waves)
  ungroup() %>% group_by(nohouse_encr) %>%
  slice_sample(n = 1) %>% # Keep one random person per household
  ungroup() %>%
  dplyr::rename(age_res = age,
                education_res = education,
                generation_res = generation) %>%
  dplyr::select(nomem_encr, nohouse_encr, year, higher_edu, education_res, age_res, gender, migration_background, generation_res,
         pers1_know, pers1_edu,
         pers2_know, pers2_edu,
         pers3_know, pers3_edu,
         pers4_know, pers4_edu,
         pers5_know, pers5_edu,
         pers1_age, pers2_age,
         pers3_age, pers4_age,
         pers5_age,
         pers1_gend, pers2_gend,
         pers3_gend, pers4_gend,
         pers5_gend, 
         pers1_ethn, pers2_ethn, 
         pers3_ethn, pers4_ethn,
         pers5_ethn)


case_df <- case_df %>%
  pivot_longer(
    cols = starts_with("pers"),
    names_to = c("person", ".value"),
    names_pattern = "pers(\\d+)_(.+)"
  ) %>% drop_na(know)


# Drop cases with education of tie unknown or still in school (these are also dropped further up in the code)
case_df <- case_df  %>% filter(edu > 1 & edu < 8)

# Drop anyone that's family
case_df <- case_df %>% filter(know> 5)  # 2 is parents (these drop out already due to NA in education tie), Greater than 5 is without any family or partner


# Add that these are all ties
case_df$tie <- 1 # All these are cases with a tie

# Add education distance
case_df$higher_edu_tie <- ifelse(case_df$edu > 5, 1, 0) #whether a tie is higher educated
case_df$education_distance <- ifelse(case_df$higher_edu != case_df$higher_edu_tie, 1, 0)

# Add continuous education distance
case_df$education_distance_cont <- abs((case_df$edu - 1) - case_df$education_res)

# Add gender distance
case_df$gender_distance <- ifelse(case_df$gender != case_df$gend, 1, 0)

# Add ethnicity distance
case_df$ethn[case_df$ethn<1] <- NA
case_df$ethn[case_df$ethn==1] <- 0
case_df$ethn[case_df$ethn>1] <- 1
case_df$ethn_distance <- ifelse(case_df$migration_background != case_df$ethn, 1, 0)

# Add Age distance (by 15 years)
case_df$age[case_df$age<1 | case_df$age==14] <- NA
case_df$age[case_df$age <= 4] <- 1 # 16 to 30
case_df$age[case_df$age > 4 & case_df$age <= 7] <- 2 # 30 to 45
case_df$age[case_df$age > 4 & case_df$age <= 10] <- 3 # 45 to 60
case_df$age[case_df$age > 10] <- 4 # 45 to 60

case_df$age_res_group <- NA
case_df$age_res_group[case_df$age_res <= 30] <- 1
case_df$age_res_group[case_df$age_res > 30 & case_df$age_res <= 45] <- 2
case_df$age_res_group[case_df$age_res > 45 & case_df$age_res <= 60] <- 3
case_df$age_res_group[case_df$age_res > 60] <- 4
case_df$age_distance <- ifelse(case_df$age_res_group != case_df$age, 1, 0)


#drop NA cases where education tie or other characteristics of the tie are not reported
hold_df <- case_df %>% drop_na(education_distance, ethn_distance, age_distance, gender_distance, nohouse_encr) 
case_df <- hold_df %>%  drop_na(education_distance, ethn_distance, age_distance, gender_distance, nohouse_encr) %>% 
 dplyr::select(-nohouse_encr, -edu, -person, -know, -nomem_encr, -age_res,
         -age, -gend, -ethn,
         -age_res_group, -year)




####### Create the control DF by randomly switching ties
### We take the case df, this ensures that we have exactly the same respondents we are randomly matching with each other
full_panel_save <- hold_df %>%  distinct(nomem_encr, year, .keep_all=T) %>%
 dplyr::select(-education_distance, -person, -know, -edu, -age, -gend, -ethn, -tie, -higher_edu_tie,
         -education_distance_cont, -gender_distance, -ethn_distance, -age_res_group, -age_distance)

# We create one million random matches from the case df
n <- 1000000
set.seed(123)
nomem_encr_year <- full_panel_save %>% 
  pull(nomem_encr)


nomem_encr <- sample(nomem_encr_year, size = n, replace = T) # Random person
nomem_encr_tie <- sample(nomem_encr_year, size = n, replace = T) # with a random other person as tie 

# Combine the samples with the year into a temporary dataframe
control_df <- data.frame(nomem_encr = nomem_encr, nomem_encr_tie = nomem_encr_tie)


### Take the information belonging to the nomem_encr to merge into the control_df
#   Also DROPS people for whom we have missing demographic information. This ensures that for none of the simulated fake ties there is a missing distance
sub <- full_panel_save %>%dplyr::select(nomem_encr, nohouse_encr, higher_edu, education_res, age_res, gender, migration_background, generation_res)  %>%
  drop_na(nomem_encr, education_res, higher_edu, age_res, gender, migration_background)

control_df <- left_join(control_df, sub) #join in info for respondents


# Now take the information for the control df ties
sub <- sub %>% rename(nomem_encr_tie=nomem_encr, nohouse_encr_tie=nohouse_encr, higher_edu_tie=higher_edu, education_tie=education_res, #rename so you can join in for ties
                      age_tie=age_res, gender_tie=gender, migration_background_tie=migration_background) %>% dplyr::select(-generation_res) #not calculating distance for generation so can drop it
control_df <- left_join(control_df, sub) #join in info for simulated ties
rm(sub)


# Remove cases where respondents were matched with themselves
control_df$selfmatch <- ifelse(control_df$nomem_encr == control_df$nomem_encr_tie, 1, 0)
table(control_df$selfmatch)

# Remove cases where respondents were matched with someone from their own household (these dont exist anymore as its one person per household)
control_df$hhmatch <- ifelse(control_df$nohouse_encr == control_df$nohouse_encr_tie, 1, 0)
table(control_df$hhmatch)


# Clean and prepare to match in with the rest of the DF
control_df <- control_df %>% filter(selfmatch==0 & hhmatch==0) %>% 
 dplyr::select(-selfmatch, -nomem_encr, -nomem_encr_tie) %>%
  rename()

control_df$tie <- 0 # None are ties

## Add the distance variables
# Education
control_df$education_distance <- ifelse(control_df$higher_edu != control_df$higher_edu_tie, 1, 0)

# Continuous education
control_df$education_distance_cont <- abs(control_df$education_res - control_df$education_tie)


# Add gender distance
control_df$gender_distance <- ifelse(control_df$gender != control_df$gender_tie, 1, 0)

# Add ethnicity distance
control_df$ethn_distance <- ifelse(control_df$migration_background != control_df$migration_background_tie, 1, 0) # migration background uses CBS definition (both first and second gen immigrant). The tie variable is unclear, it just asks about 'origin'

# Add Age distance (by 15 years)
control_df$age_res[control_df$age_res <= 30] <- 1
control_df$age_res[control_df$age_res > 30 & control_df$age_res <= 45] <- 2
control_df$age_res[control_df$age_res > 45 & control_df$age_res <= 60] <- 3
control_df$age_res[control_df$age_res > 60] <- 4

control_df$age_tie[control_df$age_tie <= 30] <- 1
control_df$age_tie[control_df$age_tie > 30 & control_df$age_tie <= 45] <- 2
control_df$age_tie[control_df$age_tie > 45 & control_df$age_tie <= 60] <- 3
control_df$age_tie[control_df$age_tie > 60] <- 4
control_df$age_distance <- ifelse(control_df$age_res != control_df$age_tie, 1, 0)


control_df <- control_df %>% dplyr::select(-age_res, -age_tie, -gender_tie, -education_tie,
                                           -migration_background_tie, -nohouse_encr, -nohouse_encr_tie, -hhmatch)


# Rename the generation variable so that it becomes a bit easier to use
control_df$generation_res <- as.character(control_df$generation_res)
control_df$generation_res[control_df$generation_res=="1"] <- "45 or before"
control_df$generation_res[control_df$generation_res=="2"] <- "46 - 55"
control_df$generation_res[control_df$generation_res=="3"] <- "56 - 65"
control_df$generation_res[control_df$generation_res=="4"] <- "66 - 75"
control_df$generation_res[control_df$generation_res=="5"] <- "76 - 85"
control_df$generation_res[control_df$generation_res=="6"] <- "86 - 95"
control_df$generation_res[control_df$generation_res=="7"] <- "96 or later"

# merge together last two groups so N is large enough
control_df$generation_res[control_df$generation_res=="96 or later"] <- "86 - 95"
control_df$generation_res[control_df$generation_res=="86 - 95"] <- "86 or later"


# Rename the generation variable so that it becomes a bit easier to use
case_df$generation_res <- as.character(case_df$generation_res)
case_df$generation_res[case_df$generation_res=="1"] <- "45 or before"
case_df$generation_res[case_df$generation_res=="2"] <- "46 - 55"
case_df$generation_res[case_df$generation_res=="3"] <- "56 - 65"
case_df$generation_res[case_df$generation_res=="4"] <- "66 - 75"
case_df$generation_res[case_df$generation_res=="5"] <- "76 - 85"
case_df$generation_res[case_df$generation_res=="6"] <- "86 - 95"
case_df$generation_res[case_df$generation_res=="7"] <- "96 or later"

# merge together last two groups so N is large enough
case_df$generation_res[case_df$generation_res=="96 or later"] <- "86 - 95"
case_df$generation_res[case_df$generation_res=="86 - 95"] <- "86 or later"




# Merge case and control together a nd subset the df
final_df <- bind_rows(control_df, case_df)

final_df_he <- final_df %>% filter(higher_edu==1)
final_df_le <- final_df %>% filter(higher_edu==0)

final_df_women <- final_df %>% filter(gender==2)
final_df_men <- final_df %>% filter(gender==1)

final_df_mb <- final_df %>% filter(migration_background==1)
final_df_nomb <- final_df %>% filter(migration_background==0)





### Run the case control models
cc_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df, family="binomial")
cc_edu_high_cont <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_he, family="binomial")
cc_edu_low_cont <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_le, family="binomial")
cc_women_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_women, family="binomial")
cc_men_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_men, family="binomial")
cc_mb_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_mb, family="binomial")
cc_nomb_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_nomb, family="binomial")





######## Get the SEs through the bootstrapping approach
# get the right N, which is the number of respondents we actually have
meannyear <- full_panel_save %>% distinct(nomem_encr) %>% summarize(n()) %>% summarize(mean(`n()`)) %>% pull(`mean(\`n()\`)`)

## In a loop
n_iterations <- 1000

# Initialize lists to store coefficients for each model
coefficients_casecont_cont_sim <- list()
coefficients_casecont_edu_high_cont_sim <- list()
coefficients_casecont_edu_low_cont_sim <- list()
coefficients_casecont_women_cont_sim <- list()
coefficients_casecont_men_cont_sim <- list()
coefficients_casecont_mb_cont_sim <- list()
coefficients_casecont_nomb_cont_sim <- list()







set.seed(123)
# Loop through n iterations
for(i in 1:n_iterations) {
  # Randomly sample observations based on meannyear
  sample_case_df <- case_df[sample(nrow(case_df), meannyear), ]
  sample_control_df <- control_df[sample(nrow(control_df), meannyear), ]
  
  # Combine sampled dataframes
  final_df_loop <- rbind(sample_case_df, sample_control_df)
  
  # Subset the dataframe 
  final_df_loop_he <- subset(final_df_loop, higher_edu == 1)
  final_df_loop_le <- subset(final_df_loop, higher_edu == 0)
  final_df_loop_women <- final_df_loop %>% filter(gender==2)
  final_df_loop_men <- final_df_loop %>% filter(gender==1)
  
  final_df_loop_mb <- final_df_loop %>% filter(migration_background==1)
  final_df_loop_nomb <- final_df_loop %>% filter(migration_background==0)
  
  # Run the seven models
  casecont_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop, family="binomial")
  casecont_edu_high_cont_sim <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_he, family="binomial")
  casecont_edu_low_cont_sim <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_le, family="binomial")
  casecont_women_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_women, family="binomial")
  casecont_men_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_men, family="binomial")
  casecont_mb_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_mb, family="binomial")
  casecont_nomb_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_nomb, family="binomial")
  
  # Store coefficients in respective lists
  coefficients_casecont_cont_sim[[i]] <- (coef(casecont_cont_sim))
  coefficients_casecont_edu_high_cont_sim[[i]] <- (coef(casecont_edu_high_cont_sim))
  coefficients_casecont_edu_low_cont_sim[[i]] <- (coef(casecont_edu_low_cont_sim))
  coefficients_casecont_women_cont_sim[[i]] <- (coef(casecont_women_cont_sim))
  coefficients_casecont_men_cont_sim[[i]] <- (coef(casecont_men_cont_sim))
  coefficients_casecont_mb_cont_sim[[i]] <- (coef(casecont_mb_cont_sim))
  coefficients_casecont_nomb_cont_sim[[i]] <- (coef(casecont_nomb_cont_sim))
  
}

# Function to calculate standard deviation of coefficients
calc_std_dev <- function(coef_list) {
  # Convert list to dataframe
  coef_df <- do.call(rbind, coef_list)
  # Calculate standard deviation for each coefficient
  apply(coef_df, 2, sd)
}

# Calculate standard deviations for each model
se_cont_sim <- calc_std_dev(coefficients_casecont_cont_sim)
se_edu_high_cont_sim <- calc_std_dev(coefficients_casecont_edu_high_cont_sim)
se_edu_low_cont_sim <- calc_std_dev(coefficients_casecont_edu_low_cont_sim)
se_women_cont_sim <- calc_std_dev(coefficients_casecont_women_cont_sim)
se_men_cont_sim <- calc_std_dev(coefficients_casecont_men_cont_sim)
se_mb_cont_sim <- calc_std_dev(coefficients_casecont_mb_cont_sim)
se_nomb_cont_sim <- calc_std_dev(coefficients_casecont_nomb_cont_sim)




## Use stargazer to create a table. As opposed to texreg it allows you to enter custom SEs
library(stargazer)


# Create stargazer table. Somehow you need to do this in two goes due to old un-updated package. Chatgpt can merge the tables, also note that you manually have to add the table notes in the three-part table format
stargazer(cc_cont, cc_edu_high_cont, cc_edu_low_cont,
          se = list(se_cont_sim, se_edu_high_cont_sim, se_edu_low_cont_sim),
          type = "latex",  # Output type
          title = "LISS - Case Control Analysis - Non-kin ties",  # Title of the table
          header = FALSE,  # Do not include R-squared and other statistics typically at the header
          model.names = FALSE,  # Do not include default model names
          covariate.labels = c("Education Distance", "Ethnicity Distance", "Gender Distance", "Age Distance"),
          omit.stat = c("all"),  # Omit all additional statistics like ll, AIC, BIC, etc.
          star.cutoffs = c(0.05, 0.01, 0.001),  # Specify star cutoffs
          omit = "Constant",  # Omit the intercept from the table
          notes = "DV: whether there is a tie between two people. Education distance is 1 if a respondent and the comparison person do not have the same education level. Ethnicity distance is 1 if ties don't have the same migration background. Gender distance is 1 if ties are not the same gender. Age distance is 1 if ties are more than 15 years apart. Logistic regression results are reported in log-odds. The intercept is not shown.",
          notes.align = "l",  # Align notes to the left
          label = "tab:case_control_analysis",  # LaTeX label for referencing
          column.labels = c("Full Sample" ,"High Edu", "Low Edu"))


stargazer(cc_women_cont, cc_men_cont, cc_mb_cont, cc_nomb_cont,
          se = list(se_women_cont_sim, se_men_cont_sim, se_mb_cont_sim, se_nomb_cont_sim),
          type = "latex",  # Output type
          title = "LISS - Case Control Analysis - Non-kin ties",  # Title of the table
          header = FALSE,  # Do not include R-squared and other statistics typically at the header
          model.names = FALSE,  # Do not include default model names
          covariate.labels = c("Education Distance", "Ethnicity Distance", "Gender Distance", "Age Distance"),
          omit.stat = c("all"),  # Omit all additional statistics like ll, AIC, BIC, etc.
          star.cutoffs = c(0.05, 0.01, 0.001),  # Specify star cutoffs
          omit = "Constant",  # Omit the intercept from the table
          notes = "DV: whether there is a tie between two people. Education distance is 1 if a respondent and the comparison person do not have the same education level. Ethnicity distance is 1 if ties don't have the same migration background. Gender distance is 1 if ties are not the same gender. Age distance is 1 if ties are more than 15 years apart. Logistic regression results are reported in log-odds. The intercept is not shown.",
          notes.align = "l",  # Align notes to the left
          label = "tab:case_control_analysis",  # LaTeX label for referencing
          column.labels = c("Women" ,"Men", "Minorities", "Non-minority"))



























################ Case Control Method with all the ties ---------------------------------------------------------
# Run code up to line 1000 first before running this. It cleans the data

##### Case control method for only friends (excluding partners, parents, and siblings)
## Create case variables One row for each tie 
case_df <- full_panel  %>% 
  filter(age > 26 & total_ties >= 1 & year >= 14) %>% # we want only those above 25. We take all ties except parents (like in all the models in the paper)
  group_by(nomem_encr) %>%
  slice_tail(n = 1) %>% # If choosing the last two waves, this keeps the last observation of each individual. Keep the last observation from each individual (the tail because the data is sorted from earliest to last waves)
  ungroup() %>% group_by(nohouse_encr) %>%
  slice_sample(n = 1) %>% # Keep one random person per household
  ungroup() %>%
  dplyr::rename(age_res = age,
                education_res = education,
                generation_res = generation) %>%
 dplyr::select(nomem_encr, nohouse_encr, year, higher_edu, education_res, age_res, gender, migration_background, generation_res,
         pers1_know, pers1_edu_wpare,
         pers2_know, pers2_edu_wpare,
         pers3_know, pers3_edu_wpare,
         pers4_know, pers4_edu_wpare,
         pers5_know, pers5_edu_wpare,
         pers1_age, pers2_age,
         pers3_age, pers4_age,
         pers5_age,
         pers1_gend, pers2_gend,
         pers3_gend, pers4_gend,
         pers5_gend, 
         pers1_ethn, pers2_ethn, 
         pers3_ethn, pers4_ethn,
         pers5_ethn) %>%
  rename(pers1_edu = pers1_edu_wpare, #rename so that I can keep the rest of the code the same
         pers2_edu = pers2_edu_wpare,
         pers3_edu = pers3_edu_wpare,
         pers4_edu = pers4_edu_wpare,
         pers5_edu = pers5_edu_wpare)


case_df <- case_df %>%
  pivot_longer(
    cols = starts_with("pers"),
    names_to = c("person", ".value"),
    names_pattern = "pers(\\d+)_(.+)"
  ) %>% drop_na(know)


# Drop cases with education of tie unknown or still in school (these are also dropped further up in the code)
case_df <- case_df  %>% filter(edu > 1 & edu < 8)

# Add that these are all ties
case_df$tie <- 1 # All these are cases with a tie

# Add education distance
case_df$higher_edu_tie <- ifelse(case_df$edu > 5, 1, 0) #whether a tie is higher educated
case_df$education_distance <- ifelse(case_df$higher_edu != case_df$higher_edu_tie, 1, 0)

# Add continuous education distance
case_df$education_distance_cont <- abs((case_df$edu - 1) - case_df$education_res)

# Add gender distance
case_df$gender_distance <- ifelse(case_df$gender != case_df$gend, 1, 0)

# Add ethnicity distance
case_df$ethn[case_df$ethn<1] <- NA
case_df$ethn[case_df$ethn==1] <- 0
case_df$ethn[case_df$ethn>1] <- 1
case_df$ethn_distance <- ifelse(case_df$migration_background != case_df$ethn, 1, 0)

# Add Age distance (by 15 years)
case_df$age[case_df$age<1 | case_df$age==14] <- NA
case_df$age[case_df$age <= 4] <- 1 # 16 to 30
case_df$age[case_df$age > 4 & case_df$age <= 7] <- 2 # 30 to 45
case_df$age[case_df$age > 4 & case_df$age <= 10] <- 3 # 45 to 60
case_df$age[case_df$age > 10] <- 4 # 45 to 60

case_df$age_res_group <- NA
case_df$age_res_group[case_df$age_res <= 30] <- 1
case_df$age_res_group[case_df$age_res > 30 & case_df$age_res <= 45] <- 2
case_df$age_res_group[case_df$age_res > 45 & case_df$age_res <= 60] <- 3
case_df$age_res_group[case_df$age_res > 60] <- 4
case_df$age_distance <- ifelse(case_df$age_res_group != case_df$age, 1, 0)


#drop NA cases where education tie or other characteristics of the tie are not reported
hold_df <- case_df %>% drop_na(education_distance, ethn_distance, age_distance, gender_distance, nohouse_encr) 
case_df <- hold_df %>%  drop_na(education_distance, ethn_distance, age_distance, gender_distance, nohouse_encr) %>% 
 dplyr::select(-nohouse_encr, -edu, -person, -know, -nomem_encr, -age_res,
         -age, -gend, -ethn,
         -age_res_group, -year)




####### Create the control DF by randomly switching ties
### We take the case df, this ensures that we have exactly the same respondents we are randomly matching with each other
full_panel_save <- hold_df %>%  distinct(nomem_encr, year, .keep_all=T) %>%
 dplyr::select(-education_distance, -person, -know, -edu, -age, -gend, -ethn, -tie, -higher_edu_tie,
         -education_distance_cont, -gender_distance, -ethn_distance, -age_res_group, -age_distance)

# We create one million random matches from the case df
n <- 1000000

nomem_encr_year <- full_panel_save %>% 
  pull(nomem_encr)


nomem_encr <- sample(nomem_encr_year, size = n, replace = T) # Random person
nomem_encr_tie <- sample(nomem_encr_year, size = n, replace = T) # with a random other person as tie 

# Combine the samples with the year into a temporary dataframe
control_df <- data.frame(nomem_encr = nomem_encr, nomem_encr_tie = nomem_encr_tie)


### Take the information belonging to the nomem_encr to merge into the control_df
#   Also DROPS people for whom we have missing demographic information. This ensures that for none of the simulated fake ties there is a missing distance
sub <- full_panel_save %>%dplyr::select(nomem_encr, nohouse_encr, higher_edu, education_res, age_res, gender, migration_background, generation_res)  %>%
  drop_na(nomem_encr, education_res, higher_edu, age_res, gender, migration_background)

control_df <- left_join(control_df, sub) #join in info for respondents


# Now take the information for the control df ties
sub <- sub %>% rename(nomem_encr_tie=nomem_encr, nohouse_encr_tie=nohouse_encr, higher_edu_tie=higher_edu, education_tie=education_res, #rename so you can join in for ties
                      age_tie=age_res, gender_tie=gender, migration_background_tie=migration_background) %>% dplyr::select(-generation_res) #not calculating distance for generation so can drop it
control_df <- left_join(control_df, sub) #join in info for simulated ties
rm(sub)


# Remove cases where respondents were matched with themselves
control_df$selfmatch <- ifelse(control_df$nomem_encr == control_df$nomem_encr_tie, 1, 0)
table(control_df$selfmatch)

# Remove cases where respondents were matched with someone from their own household (these dont exist anymore as its one person per household)
control_df$hhmatch <- ifelse(control_df$nohouse_encr == control_df$nohouse_encr_tie, 1, 0)
table(control_df$hhmatch)


# Clean and prepare to match in with the rest of the DF
control_df <- control_df %>% filter(selfmatch==0 & hhmatch==0) %>% 
 dplyr::select(-selfmatch, -nomem_encr, -nomem_encr_tie) %>%
  rename()

control_df$tie <- 0 # None are ties

## Add the distance variables
# Education
control_df$education_distance <- ifelse(control_df$higher_edu != control_df$higher_edu_tie, 1, 0)

# Continuous education
control_df$education_distance_cont <- abs(control_df$education_res - control_df$education_tie)


# Add gender distance
control_df$gender_distance <- ifelse(control_df$gender != control_df$gender_tie, 1, 0)

# Add ethnicity distance
control_df$ethn_distance <- ifelse(control_df$migration_background != control_df$migration_background_tie, 1, 0) # migration background uses CBS definition (both first and second gen immigrant). The tie variable is unclear, it just asks about 'origin'

# Add Age distance (by 15 years)
control_df$age_res[control_df$age_res <= 30] <- 1
control_df$age_res[control_df$age_res > 30 & control_df$age_res <= 45] <- 2
control_df$age_res[control_df$age_res > 45 & control_df$age_res <= 60] <- 3
control_df$age_res[control_df$age_res > 60] <- 4

control_df$age_tie[control_df$age_tie <= 30] <- 1
control_df$age_tie[control_df$age_tie > 30 & control_df$age_tie <= 45] <- 2
control_df$age_tie[control_df$age_tie > 45 & control_df$age_tie <= 60] <- 3
control_df$age_tie[control_df$age_tie > 60] <- 4
control_df$age_distance <- ifelse(control_df$age_res != control_df$age_tie, 1, 0)


control_df <- control_df %>% dplyr::select(-age_res, -age_tie, -gender_tie, -education_tie,
                                           -migration_background_tie, -nohouse_encr, -nohouse_encr_tie, -hhmatch)


# Rename the generation variable so that it becomes a bit easier to use
control_df$generation_res <- as.character(control_df$generation_res)
control_df$generation_res[control_df$generation_res=="1"] <- "45 or before"
control_df$generation_res[control_df$generation_res=="2"] <- "46 - 55"
control_df$generation_res[control_df$generation_res=="3"] <- "56 - 65"
control_df$generation_res[control_df$generation_res=="4"] <- "66 - 75"
control_df$generation_res[control_df$generation_res=="5"] <- "76 - 85"
control_df$generation_res[control_df$generation_res=="6"] <- "86 - 95"
control_df$generation_res[control_df$generation_res=="7"] <- "96 or later"

# merge together last two groups so N is large enough
control_df$generation_res[control_df$generation_res=="96 or later"] <- "86 - 95"
control_df$generation_res[control_df$generation_res=="86 - 95"] <- "86 or later"


# Rename the generation variable so that it becomes a bit easier to use
case_df$generation_res <- as.character(case_df$generation_res)
case_df$generation_res[case_df$generation_res=="1"] <- "45 or before"
case_df$generation_res[case_df$generation_res=="2"] <- "46 - 55"
case_df$generation_res[case_df$generation_res=="3"] <- "56 - 65"
case_df$generation_res[case_df$generation_res=="4"] <- "66 - 75"
case_df$generation_res[case_df$generation_res=="5"] <- "76 - 85"
case_df$generation_res[case_df$generation_res=="6"] <- "86 - 95"
case_df$generation_res[case_df$generation_res=="7"] <- "96 or later"

# merge together last two groups so N is large enough
case_df$generation_res[case_df$generation_res=="96 or later"] <- "86 - 95"
case_df$generation_res[case_df$generation_res=="86 - 95"] <- "86 or later"




# Merge case and control together a nd subset the df
final_df <- bind_rows(control_df, case_df)

final_df_he <- final_df %>% filter(higher_edu==1)
final_df_le <- final_df %>% filter(higher_edu==0)

final_df_women <- final_df %>% filter(gender==2)
final_df_men <- final_df %>% filter(gender==1)

final_df_mb <- final_df %>% filter(migration_background==1)
final_df_nomb <- final_df %>% filter(migration_background==0)





### Run the case control models
cc_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df, family="binomial")
cc_edu_high_cont <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_he, family="binomial")
cc_edu_low_cont <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_le, family="binomial")
cc_women_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_women, family="binomial")
cc_men_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_men, family="binomial")
cc_mb_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_mb, family="binomial")
cc_nomb_cont <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_nomb, family="binomial")




######## Get the SEs through the bootstrapping approach
# get the right N, which is the number of respondents we actually have
meannyear <- full_panel_save %>% distinct(nomem_encr) %>% summarize(n()) %>% summarize(mean(`n()`)) %>% pull(`mean(\`n()\`)`)

## In a loop
n_iterations <- 1000

# Initialize lists to store coefficients for each model
coefficients_casecont_cont_sim <- list()
coefficients_casecont_edu_high_cont_sim <- list()
coefficients_casecont_edu_low_cont_sim <- list()
coefficients_casecont_women_cont_sim <- list()
coefficients_casecont_men_cont_sim <- list()
coefficients_casecont_mb_cont_sim <- list()
coefficients_casecont_nomb_cont_sim <- list()


set.seed(123)
# Loop through n iterations
for(i in 1:n_iterations) {
  # Randomly sample observations based on meannyear
  sample_case_df <- case_df[sample(nrow(case_df), meannyear), ]
  sample_control_df <- control_df[sample(nrow(control_df), meannyear), ]
  
  # Combine sampled dataframes
  final_df_loop <- rbind(sample_case_df, sample_control_df)
  
  # Subset the dataframe 
  final_df_loop_he <- subset(final_df_loop, higher_edu == 1)
  final_df_loop_le <- subset(final_df_loop, higher_edu == 0)
  final_df_loop_women <- final_df_loop %>% filter(gender==2)
  final_df_loop_men <- final_df_loop %>% filter(gender==1)
  
  final_df_loop_mb <- final_df_loop %>% filter(migration_background==1)
  final_df_loop_nomb <- final_df_loop %>% filter(migration_background==0)
  
  # Run the seven models
  casecont_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop, family="binomial")
  casecont_edu_high_cont_sim <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_he, family="binomial")
  casecont_edu_low_cont_sim <-  glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_le, family="binomial")
  casecont_women_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_women, family="binomial")
  casecont_men_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_men, family="binomial")
  casecont_mb_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_mb, family="binomial")
  casecont_nomb_cont_sim <- glm(tie~education_distance + ethn_distance + gender_distance + age_distance , data=final_df_loop_nomb, family="binomial")
  
  # Store coefficients in respective lists
  coefficients_casecont_cont_sim[[i]] <- (coef(casecont_cont_sim))
  coefficients_casecont_edu_high_cont_sim[[i]] <- (coef(casecont_edu_high_cont_sim))
  coefficients_casecont_edu_low_cont_sim[[i]] <- (coef(casecont_edu_low_cont_sim))
  coefficients_casecont_women_cont_sim[[i]] <- (coef(casecont_women_cont_sim))
  coefficients_casecont_men_cont_sim[[i]] <- (coef(casecont_men_cont_sim))
  coefficients_casecont_mb_cont_sim[[i]] <- (coef(casecont_mb_cont_sim))
  coefficients_casecont_nomb_cont_sim[[i]] <- (coef(casecont_nomb_cont_sim))
  
}

# Function to calculate standard deviation of coefficients
calc_std_dev <- function(coef_list) {
  # Convert list to dataframe
  coef_df <- do.call(rbind, coef_list)
  # Calculate standard deviation for each coefficient
  apply(coef_df, 2, sd)
}

# Calculate standard deviations for each model
se_cont_sim <- calc_std_dev(coefficients_casecont_cont_sim)
se_edu_high_cont_sim <- calc_std_dev(coefficients_casecont_edu_high_cont_sim)
se_edu_low_cont_sim <- calc_std_dev(coefficients_casecont_edu_low_cont_sim)
se_women_cont_sim <- calc_std_dev(coefficients_casecont_women_cont_sim)
se_men_cont_sim <- calc_std_dev(coefficients_casecont_men_cont_sim)
se_mb_cont_sim <- calc_std_dev(coefficients_casecont_mb_cont_sim)
se_nomb_cont_sim <- calc_std_dev(coefficients_casecont_nomb_cont_sim)




## Use stargazer to create a table. As opposed to texreg it allows you to enter custom SEs
library(stargazer)


# Create stargazer table. Somehow you need to do this in two goes due to old un-updated package. Chatgpt can merge the tables, also note that you manually have to add the table notes in the three-part table format
stargazer(cc_cont, cc_edu_high_cont, cc_edu_low_cont,
          se = list(se_cont_sim, se_edu_high_cont_sim, se_edu_low_cont_sim),
          type = "latex",  # Output type
          title = "LISS - Case Control Analysis - All ties",  # Title of the table
          header = FALSE,  # Do not include R-squared and other statistics typically at the header
          model.names = FALSE,  # Do not include default model names
          covariate.labels = c("Education Distance", "Ethnicity Distance", "Gender Distance", "Age Distance"),
          omit.stat = c("all"),  # Omit all additional statistics like ll, AIC, BIC, etc.
          star.cutoffs = c(0.05, 0.01, 0.001),  # Specify star cutoffs
          omit = "Constant",  # Omit the intercept from the table
          notes = "DV: whether there is a tie between two people. Education distance is 1 if a respondent and the comparison person do not have the same education level. Ethnicity distance is 1 if ties don't have the same migration background. Gender distance is 1 if ties are not the same gender. Age distance is 1 if ties are more than 15 years apart. Logistic regression results are reported in log-odds. The intercept is not shown.",
          notes.align = "l",  # Align notes to the left
          label = "tab:case_control_analysis",  # LaTeX label for referencing
          column.labels = c("Full Sample" ,"High Edu", "Low Edu"))


stargazer(cc_women_cont, cc_men_cont, cc_mb_cont, cc_nomb_cont,
          se = list(se_women_cont_sim, se_men_cont_sim, se_mb_cont_sim, se_nomb_cont_sim),
          type = "latex",  # Output type
          title = "LISS - Case Control Analysis - All ties",  # Title of the table
          header = FALSE,  # Do not include R-squared and other statistics typically at the header
          model.names = FALSE,  # Do not include default model names
          covariate.labels = c("Education Distance", "Ethnicity Distance", "Gender Distance", "Age Distance"),
          omit.stat = c("all"),  # Omit all additional statistics like ll, AIC, BIC, etc.
          star.cutoffs = c(0.05, 0.01, 0.001),  # Specify star cutoffs
          omit = "Constant",  # Omit the intercept from the table
          notes = "DV: whether there is a tie between two people. Education distance is 1 if a respondent and the comparison person do not have the same education level. Ethnicity distance is 1 if ties don't have the same migration background. Gender distance is 1 if ties are not the same gender. Age distance is 1 if ties are more than 15 years apart. Logistic regression results are reported in log-odds. The intercept is not shown.",
          notes.align = "l",  # Align notes to the left
          label = "tab:case_control_analysis",  # LaTeX label for referencing
          column.labels = c("Women" ,"Men", "Minorities", "Non-minority"))








  
  
  
  
  
  
  
  

